[ci] check formatting (#64)

* [ci] check formatting

* [rustfmt] reformat the whole library
This commit is contained in:
gnzlbg 2017-10-27 17:55:29 +02:00 committed by Alex Crichton
parent 5869eca3e9
commit 69d2ad85f3
35 changed files with 2207 additions and 1405 deletions

View file

@ -17,9 +17,21 @@ matrix:
script: ci/run.sh
- install: true
script: ci/dox.sh
- env: RUSTFMT=On TARGET=x86_64-unknown-linux-gnu NO_ADD=1
script: |
cargo install rustfmt-nightly
cargo fmt -- --write-mode=diff
cd stdsimd
cargo fmt -- --write-mode=diff
cd assert-instr-macro
cargo fmt -- --write-mode=diff
cd ../simd-test-macro
cargo fmt -- --write-mode=diff
allow_failures:
- env: RUSTFMT=On TARGET=x86_64-unknown-linux-gnu NO_ADD=1
install:
- if [ "$NO_ADD" = "" ]; then rustup target add $TARGET; fi
- if [ "$NO_ADD" == "" ]; then rustup target add $TARGET; fi
script:
- cargo generate-lockfile

View file

@ -26,7 +26,8 @@ impl Frsqrt for f64x2 {
let u = unsafe {
vendor::_mm_rsqrt_ps(
f32x4::new(t.extract(0), t.extract(1), 0., 0.)).as_f64x4()
f32x4::new(t.extract(0), t.extract(1), 0., 0.),
).as_f64x4()
};
f64x2::new(u.extract(0), u.extract(1))
}
@ -36,11 +37,12 @@ impl Frsqrt for f64x2 {
use self::stdsimd::vendor;
unsafe { vendor::vrsqrte_f32(self.as_f32x2()).as_f64x2() }
}
#[cfg(not(any(all(any(target_arch = "x86", target_arch = "x86_64"),
#[cfg(not(any(all(any(target_arch = "x86",
target_arch = "x86_64"),
target_feature = "sse"),
all(any(target_arch = "arm", target_arch = "aarch64"),
target_feature = "neon")
)))]
all(any(target_arch = "arm",
target_arch = "aarch64"),
target_feature = "neon"))))]
{
self.replace(0, 1. / self.extract(0).sqrt());
self.replace(1, 1. / self.extract(1).sqrt());
@ -57,9 +59,9 @@ struct Body {
}
impl Body {
fn new(x0: f64, x1: f64, x2: f64,
v0: f64, v1: f64, v2: f64,
mass: f64) -> Body {
fn new(
x0: f64, x1: f64, x2: f64, v0: f64, v1: f64, v2: f64, mass: f64
) -> Body {
Body {
x: [x0, x1, x2],
_fill: 0.0,
@ -91,7 +93,7 @@ fn advance(bodies: &mut [Body; N_BODIES], dt: f64) {
let mut i = 0;
for j in 0..N_BODIES {
for k in j+1..N_BODIES {
for k in j + 1..N_BODIES {
for m in 0..3 {
r[i][m] = bodies[j].x[m] - bodies[k].x[m];
}
@ -102,14 +104,15 @@ fn advance(bodies: &mut [Body; N_BODIES], dt: f64) {
i = 0;
while i < N {
for m in 0..3 {
dx[m] = f64x2::new(r[i][m], r[i+1][m]);
dx[m] = f64x2::new(r[i][m], r[i + 1][m]);
}
dsquared = dx[0] * dx[0] + dx[1] * dx[1] + dx[2] * dx[2];
distance = dsquared.frsqrt();
for _ in 0..2 {
distance = distance * f64x2::splat(1.5) -
((f64x2::splat(0.5) * dsquared) * distance) * (distance * distance)
distance = distance * f64x2::splat(1.5)
- ((f64x2::splat(0.5) * dsquared) * distance)
* (distance * distance)
}
dmag = f64x2::splat(dt) / dsquared * distance;
dmag.store(&mut mag, i);
@ -119,7 +122,7 @@ fn advance(bodies: &mut [Body; N_BODIES], dt: f64) {
i = 0;
for j in 0..N_BODIES {
for k in j+1..N_BODIES {
for k in j + 1..N_BODIES {
for m in 0..3 {
bodies[j].v[m] -= r[i][m] * bodies[k].mass * mag[i];
bodies[k].v[m] += r[i][m] * bodies[j].mass * mag[i];
@ -138,15 +141,19 @@ fn energy(bodies: &[Body; N_BODIES]) -> f64 {
let mut e = 0.0;
for i in 0..N_BODIES {
let bi = &bodies[i];
e += bi.mass * (bi.v[0] * bi.v[0] + bi.v[1] * bi.v[1] + bi.v[2] * bi.v[2]) / 2.0;
for j in i+1..N_BODIES {
e += bi.mass
* (bi.v[0] * bi.v[0] + bi.v[1] * bi.v[1] + bi.v[2] * bi.v[2])
/ 2.0;
for j in i + 1..N_BODIES {
let bj = &bodies[j];
let mut dx = [0.0; 3];
for k in 0..3 {
dx[k] = bi.x[k] - bj.x[k];
}
let mut distance = 0.0;
for &d in &dx { distance += d * d }
for &d in &dx {
distance += d * d
}
e -= bi.mass * bj.mass / distance.sqrt()
}
}
@ -156,48 +163,54 @@ fn energy(bodies: &[Body; N_BODIES]) -> f64 {
fn main() {
let mut bodies: [Body; N_BODIES] = [
/* sun */
Body::new(0.0, 0.0, 0.0,
0.0, 0.0, 0.0,
SOLAR_MASS),
Body::new(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, SOLAR_MASS),
/* jupiter */
Body::new(4.84143144246472090e+00,
-1.16032004402742839e+00,
-1.03622044471123109e-01 ,
1.66007664274403694e-03 * DAYS_PER_YEAR,
7.69901118419740425e-03 * DAYS_PER_YEAR,
-6.90460016972063023e-05 * DAYS_PER_YEAR ,
9.54791938424326609e-04 * SOLAR_MASS
),
Body::new(
4.84143144246472090e+00,
-1.16032004402742839e+00,
-1.03622044471123109e-01,
1.66007664274403694e-03 * DAYS_PER_YEAR,
7.69901118419740425e-03 * DAYS_PER_YEAR,
-6.90460016972063023e-05 * DAYS_PER_YEAR,
9.54791938424326609e-04 * SOLAR_MASS,
),
/* saturn */
Body::new(8.34336671824457987e+00,
4.12479856412430479e+00,
-4.03523417114321381e-01 ,
-2.76742510726862411e-03 * DAYS_PER_YEAR,
4.99852801234917238e-03 * DAYS_PER_YEAR,
2.30417297573763929e-05 * DAYS_PER_YEAR ,
2.85885980666130812e-04 * SOLAR_MASS
),
Body::new(
8.34336671824457987e+00,
4.12479856412430479e+00,
-4.03523417114321381e-01,
-2.76742510726862411e-03 * DAYS_PER_YEAR,
4.99852801234917238e-03 * DAYS_PER_YEAR,
2.30417297573763929e-05 * DAYS_PER_YEAR,
2.85885980666130812e-04 * SOLAR_MASS,
),
/* uranus */
Body::new(1.28943695621391310e+01,
-1.51111514016986312e+01,
-2.23307578892655734e-01 ,
2.96460137564761618e-03 * DAYS_PER_YEAR,
2.37847173959480950e-03 * DAYS_PER_YEAR,
-2.96589568540237556e-05 * DAYS_PER_YEAR ,
4.36624404335156298e-05 * SOLAR_MASS
),
Body::new(
1.28943695621391310e+01,
-1.51111514016986312e+01,
-2.23307578892655734e-01,
2.96460137564761618e-03 * DAYS_PER_YEAR,
2.37847173959480950e-03 * DAYS_PER_YEAR,
-2.96589568540237556e-05 * DAYS_PER_YEAR,
4.36624404335156298e-05 * SOLAR_MASS,
),
/* neptune */
Body::new(1.53796971148509165e+01,
-2.59193146099879641e+01,
1.79258772950371181e-01 ,
2.68067772490389322e-03 * DAYS_PER_YEAR,
1.62824170038242295e-03 * DAYS_PER_YEAR,
-9.51592254519715870e-05 * DAYS_PER_YEAR ,
5.15138902046611451e-05 * SOLAR_MASS
)
];
Body::new(
1.53796971148509165e+01,
-2.59193146099879641e+01,
1.79258772950371181e-01,
2.68067772490389322e-03 * DAYS_PER_YEAR,
1.62824170038242295e-03 * DAYS_PER_YEAR,
-9.51592254519715870e-05 * DAYS_PER_YEAR,
5.15138902046611451e-05 * SOLAR_MASS,
),
];
let n: usize = std::env::args().nth(1).expect("need one arg").parse().unwrap();
let n: usize = std::env::args()
.nth(1)
.expect("need one arg")
.parse()
.unwrap();
offset_momentum(&mut bodies);
println!("{:.9}", energy(&bodies));

View file

@ -27,8 +27,12 @@ mod example {
unsafe {
vendor::_mm_cmpestri(
vneedle, needle_len as i32, vhaystack, hay_len as i32,
vendor::_SIDD_CMP_EQUAL_ORDERED) as usize
vneedle,
needle_len as i32,
vhaystack,
hay_len as i32,
vendor::_SIDD_CMP_EQUAL_ORDERED,
) as usize
}
}

View file

@ -0,0 +1,5 @@
max_width = 79
fn_call_width = 79
wrap_comments = true
error_on_line_overflow = false
fn_args_density = "Compressed"

View file

@ -3,7 +3,9 @@
//! The reference for NEON is [ARM's NEON Intrinsics Reference][arm_ref]. The
//! [ARM's NEON Intrinsics Online Database][arm_dat] is also useful.
//!
//! [arm_ref]: http://infocenter.arm.com/help/topic/com.arm.doc.ihi0073a/IHI0073A_arm_neon_intrinsics_ref.pdf
//! [arm_ref]:
//! http://infocenter.arm.com/help/topic/com.arm.doc.
//! ihi0073a/IHI0073A_arm_neon_intrinsics_ref.pdf
//! [arm_dat]: https://developer.arm.com/technologies/neon/intrinsics
pub use self::v6::*;

View file

@ -1,7 +1,10 @@
//! ARMv6 intrinsics.
//!
//! The reference is [ARMv6-M Architecture Reference
//! Manual](http://infocenter.arm.com/help/index.jsp?topic=/com.arm.doc.ddi0419c/index.html).
//! The reference is [ARMv6-M Architecture Reference Manual][armv6m].
//!
//! [armv6m]:
//! http://infocenter.arm.com/help/index.jsp?topic=/com.arm.doc.ddi0419c/index.
//! html
#[cfg(test)]
use stdsimd_test::assert_instr;
@ -27,16 +30,20 @@ mod tests {
#[test]
fn _rev_u16() {
unsafe {
assert_eq!(v6::_rev_u16(0b0000_0000_1111_1111_u16), 0b1111_1111_0000_0000_u16);
assert_eq!(
v6::_rev_u16(0b0000_0000_1111_1111_u16),
0b1111_1111_0000_0000_u16
);
}
}
#[test]
fn _rev_u32() {
unsafe {
assert_eq!(v6::_rev_u32(
0b0000_0000_1111_1111_0000_0000_1111_1111_u32
), 0b1111_1111_0000_0000_1111_1111_0000_0000_u32);
assert_eq!(
v6::_rev_u32(0b0000_0000_1111_1111_0000_0000_1111_1111_u32),
0b1111_1111_0000_0000_1111_1111_0000_0000_u32
);
}
}
}

View file

@ -1,7 +1,11 @@
//! ARMv7 intrinsics.
//!
//! The reference is [ARMv7-M Architecture Reference Manual (Issue
//! E.b)](http://infocenter.arm.com/help/index.jsp?topic=/com.arm.doc.ddi0403e.b/index.html).
//! E.b)][armv7m].
//!
//! [armv7m]:
//! http://infocenter.arm.com/help/index.jsp?topic=/com.arm.doc.ddi0403e.
//! b/index.html
pub use super::v6::*;
@ -39,7 +43,7 @@ pub unsafe fn _rbit_u32(x: u32) -> u32 {
#[allow(dead_code)]
extern "C" {
#[link_name="llvm.bitreverse.i32"]
#[link_name = "llvm.bitreverse.i32"]
fn rbit_u32(i: i32) -> i32;
}
@ -72,8 +76,10 @@ mod tests {
#[test]
fn _rbit_u32() {
unsafe {
assert_eq!(v7::_rbit_u32(0b0000_1010u32),
0b0101_0000_0000_0000_0000_0000_0000_0000u32);
assert_eq!(
v7::_rbit_u32(0b0000_1010u32),
0b0101_0000_0000_0000_0000_0000_0000_0000u32
);
}
}
}

View file

@ -5,10 +5,8 @@ use stdsimd_test::assert_instr;
use simd_llvm::simd_add;
use v64::{i8x8, i16x4, i32x2,
u8x8, u16x4, u32x2, f32x2};
use v128::{i8x16, i16x8, i32x4, i64x2,
u8x16, u16x8, u32x4, u64x2, f32x4};
use v64::{f32x2, i16x4, i32x2, i8x8, u16x4, u32x2, u8x8};
use v128::{f32x4, i16x8, i32x4, i64x2, i8x16, u16x8, u32x4, u64x2, u8x16};
/// Vector add.
#[inline(always)]
@ -230,18 +228,9 @@ mod tests {
#[test]
fn vaddq_s8_() {
let a = i8x16::new(
1, 2, 3, 4, 5, 6, 7, 8,
1, 2, 3, 4, 5, 6, 7, 8,
);
let b = i8x16::new(
8, 7, 6, 5, 4, 3, 2, 1,
8, 7, 6, 5, 4, 3, 2, 1,
);
let e = i8x16::new(
9, 9, 9, 9, 9, 9, 9, 9,
9, 9, 9, 9, 9, 9, 9, 9,
);
let a = i8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8);
let b = i8x16::new(8, 7, 6, 5, 4, 3, 2, 1, 8, 7, 6, 5, 4, 3, 2, 1);
let e = i8x16::new(9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9);
let r = unsafe { vaddq_s8(a, b) };
assert_eq!(r, e);
}
@ -293,18 +282,9 @@ mod tests {
#[test]
fn vaddq_u8_() {
let a = u8x16::new(
1, 2, 3, 4, 5, 6, 7, 8,
1, 2, 3, 4, 5, 6, 7, 8,
);
let b = u8x16::new(
8, 7, 6, 5, 4, 3, 2, 1,
8, 7, 6, 5, 4, 3, 2, 1,
);
let e = u8x16::new(
9, 9, 9, 9, 9, 9, 9, 9,
9, 9, 9, 9, 9, 9, 9, 9,
);
let a = u8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8);
let b = u8x16::new(8, 7, 6, 5, 4, 3, 2, 1, 8, 7, 6, 5, 4, 3, 2, 1);
let e = u8x16::new(9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9);
let r = unsafe { vaddq_u8(a, b) };
assert_eq!(r, e);
}
@ -366,15 +346,9 @@ mod tests {
#[test]
fn vaddl_s8_() {
let v = ::std::i8::MAX;
let a = i8x8::new(
v, v, v, v,
v, v, v, v,
);
let a = i8x8::new(v, v, v, v, v, v, v, v);
let v = 2 * (v as i16);
let e = i16x8::new(
v, v, v, v,
v, v, v, v,
);
let e = i16x8::new(v, v, v, v, v, v, v, v);
let r = unsafe { vaddl_s8(a, a) };
assert_eq!(r, e);
}
@ -382,13 +356,9 @@ mod tests {
#[test]
fn vaddl_s16_() {
let v = ::std::i16::MAX;
let a = i16x4::new(
v, v, v, v,
);
let a = i16x4::new(v, v, v, v);
let v = 2 * (v as i32);
let e = i32x4::new(
v, v, v, v,
);
let e = i32x4::new(v, v, v, v);
let r = unsafe { vaddl_s16(a, a) };
assert_eq!(r, e);
}
@ -396,13 +366,9 @@ mod tests {
#[test]
fn vaddl_s32_() {
let v = ::std::i32::MAX;
let a = i32x2::new(
v, v,
);
let a = i32x2::new(v, v);
let v = 2 * (v as i64);
let e = i64x2::new(
v, v,
);
let e = i64x2::new(v, v);
let r = unsafe { vaddl_s32(a, a) };
assert_eq!(r, e);
}
@ -410,15 +376,9 @@ mod tests {
#[test]
fn vaddl_u8_() {
let v = ::std::u8::MAX;
let a = u8x8::new(
v, v, v, v,
v, v, v, v,
);
let a = u8x8::new(v, v, v, v, v, v, v, v);
let v = 2 * (v as u16);
let e = u16x8::new(
v, v, v, v,
v, v, v, v,
);
let e = u16x8::new(v, v, v, v, v, v, v, v);
let r = unsafe { vaddl_u8(a, a) };
assert_eq!(r, e);
}
@ -426,13 +386,9 @@ mod tests {
#[test]
fn vaddl_u16_() {
let v = ::std::u16::MAX;
let a = u16x4::new(
v, v, v, v,
);
let a = u16x4::new(v, v, v, v);
let v = 2 * (v as u32);
let e = u32x4::new(
v, v, v, v,
);
let e = u32x4::new(v, v, v, v);
let r = unsafe { vaddl_u16(a, a) };
assert_eq!(r, e);
}
@ -440,13 +396,9 @@ mod tests {
#[test]
fn vaddl_u32_() {
let v = ::std::u32::MAX;
let a = u32x2::new(
v, v,
);
let a = u32x2::new(v, v);
let v = 2 * (v as u64);
let e = u64x2::new(
v, v,
);
let e = u64x2::new(v, v);
let r = unsafe { vaddl_u32(a, a) };
assert_eq!(r, e);
}

View file

@ -1,6 +1,9 @@
//! ARMv8 intrinsics.
//!
//! The reference is [ARMv8-A Reference Manual](http://infocenter.arm.com/help/index.jsp?topic=/com.arm.doc.ddi0487a.k_10775/index.html).
//! The reference is [ARMv8-A Reference Manual][armv8].
//!
//! [armv8]: http://infocenter.arm.com/help/index.jsp?topic=/com.arm.doc.
//! ddi0487a.k_10775/index.html
pub use super::v7::*;
@ -23,7 +26,7 @@ pub unsafe fn _clz_u64(x: u64) -> u64 {
#[allow(dead_code)]
extern "C" {
#[link_name="llvm.bitreverse.i64"]
#[link_name = "llvm.bitreverse.i64"]
fn rbit_u64(i: i64) -> i64;
}
@ -61,9 +64,10 @@ mod tests {
#[test]
fn _rev_u64() {
unsafe {
assert_eq!(v8::_rev_u64(
0b0000_0000_1111_1111_0000_0000_1111_1111_u64
), 0b1111_1111_0000_0000_1111_1111_0000_0000_0000_0000_0000_0000_0000_0000_0000_0000_u64);
assert_eq!(
v8::_rev_u64(0b0000_0000_1111_1111_0000_0000_1111_1111_u64),
0b1111_1111_0000_0000_1111_1111_0000_0000_0000_0000_0000_0000_0000_0000_0000_0000_u64
);
}
}
@ -77,27 +81,32 @@ mod tests {
#[test]
fn _rbit_u64() {
unsafe {
assert_eq!(v8::_rbit_u64(
0b0000_0000_1111_1101_0000_0000_1111_1111_u64
), 0b1111_1111_0000_0000_1011_1111_0000_0000_0000_0000_0000_0000_0000_0000_0000_0000_u64);
assert_eq!(
v8::_rbit_u64(0b0000_0000_1111_1101_0000_0000_1111_1111_u64),
0b1111_1111_0000_0000_1011_1111_0000_0000_0000_0000_0000_0000_0000_0000_0000_0000_u64
);
}
}
#[test]
fn _cls_u32() {
unsafe {
assert_eq!(v8::_cls_u32(
0b1111_1111_1111_1111_0000_0000_1111_1111_u32
), 15_u32);
assert_eq!(
v8::_cls_u32(0b1111_1111_1111_1111_0000_0000_1111_1111_u32),
15_u32
);
}
}
#[test]
fn _cls_u64() {
unsafe {
assert_eq!(v8::_cls_u64(
0b1111_1111_1111_1111_0000_0000_1111_1111_0000_0000_0000_0000_0000_0000_0000_0000_u64
), 15_u64);
assert_eq!(
v8::_cls_u64(
0b1111_1111_1111_1111_0000_0000_1111_1111_0000_0000_0000_0000_0000_0000_0000_0000_u64
),
15_u64
);
}
}
}

View file

@ -44,9 +44,9 @@
//! have no runtime support for whether you CPU actually supports the
//! instruction.
//!
//! CPU target feature detection is done via the `cfg_feature_enabled!` macro at
//! runtime. This macro will detect at runtime whether the specified feature is
//! available or not, returning true or false depending on the current CPU.
//! CPU target feature detection is done via the `cfg_feature_enabled!` macro
//! at runtime. This macro will detect at runtime whether the specified feature
//! is available or not, returning true or false depending on the current CPU.
//!
//! ```
//! #![feature(cfg_target_feature)]
@ -58,7 +58,8 @@
//! if cfg_feature_enabled!("avx2") {
//! println!("avx2 intrinsics will work");
//! } else {
//! println!("avx2 intrinsics will not work, they may generate SIGILL");
//! println!("avx2 intrinsics will not work");
//! // undefined behavior: may generate a `SIGILL`.
//! }
//! }
//! ```
@ -93,29 +94,33 @@
//!
//! # Status
//!
//! This crate is intended for eventual inclusion into the standard library, but
//! some work and experimentation is needed to get there! First and foremost you
//! can help out by kicking the tires on this crate and seeing if it works for
//! your use case! Next up you can help us fill out the [vendor
//! intrinsics][vendor] to ensure that we've got all the SIMD support necessary.
//! This crate is intended for eventual inclusion into the standard library,
//! but some work and experimentation is needed to get there! First and
//! foremost you can help out by kicking the tires on this crate and seeing if
//! it works for your use case! Next up you can help us fill out the [vendor
//! intrinsics][vendor] to ensure that we've got all the SIMD support
//! necessary.
//!
//! The language support and status of SIMD is also still a little up in the air
//! right now, you may be interested in a few issues along these lines:
//! The language support and status of SIMD is also still a little up in the
//! air right now, you may be interested in a few issues along these lines:
//!
//! * [Overal tracking issue for SIMD support](https://github.com/rust-lang/rust/issues/27731)
//! * [`cfg_target_feature` tracking issue](https://github.com/rust-lang/rust/issues/29717)
//! * [SIMD types currently not sound](https://github.com/rust-lang/rust/issues/44367)
//! * [`#[target_feature]` improvements](https://github.com/rust-lang/rust/issues/44839)
//! * [Overal tracking issue for SIMD support]
//! (https://github.com/rust-lang/rust/issues/27731)
//! * [`cfg_target_feature` tracking issue]
//! (https://github.com/rust-lang/rust/issues/29717)
//! * [SIMD types currently not sound]
//! (https://github.com/rust-lang/rust/issues/44367)
//! * [`#[target_feature]` improvements]
//! (https://github.com/rust-lang/rust/issues/44839)
//!
//! [vendor]: https://github.com/rust-lang-nursery/stdsimd/issues/40
#![cfg_attr(feature = "strict", deny(warnings))]
#![allow(dead_code)]
#![allow(unused_features)]
#![feature(
const_fn, link_llvm_intrinsics, platform_intrinsics, repr_simd, simd_ffi,
target_feature, cfg_target_feature, i128_type, asm, const_atomic_usize_new
)]
#![feature(const_fn, link_llvm_intrinsics, platform_intrinsics, repr_simd,
simd_ffi, target_feature, cfg_target_feature, i128_type, asm,
const_atomic_usize_new, stmt_expr_attributes)]
#![cfg_attr(test, feature(proc_macro, test))]
#[cfg(test)]

View file

@ -240,9 +240,11 @@ macro_rules! define_integer_ops {
i8, i16, i32, i64, isize);
impl ::std::fmt::LowerHex for $ty {
fn fmt(&self, f: &mut ::std::fmt::Formatter) -> ::std::fmt::Result {
fn fmt(&self, f: &mut ::std::fmt::Formatter)
-> ::std::fmt::Result {
write!(f, "{}(", stringify!($ty))?;
let n = ::std::mem::size_of_val(self) / ::std::mem::size_of::<$elem>();
let n = ::std::mem::size_of_val(self)
/ ::std::mem::size_of::<$elem>();
for i in 0..n {
if i > 0 {
write!(f, ", ")?;
@ -292,8 +294,7 @@ macro_rules! cfg_feature_enabled {
/// On ARM features are only detected at compile-time using
/// cfg(target_feature), so if this macro is executed the
/// feature is not supported.
#[cfg(any(target_arch = "arm",
target_arch = "aarch64"))]
#[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
#[macro_export]
#[doc(hidden)]
macro_rules! __unstable_detect_feature {
@ -302,10 +303,8 @@ macro_rules! __unstable_detect_feature {
}
/// In all unsupported architectures using the macro is an error
#[cfg(not(any(target_arch = "x86",
target_arch = "x86_64",
target_arch = "arm",
target_arch = "aarch64")))]
#[cfg(not(any(target_arch = "x86", target_arch = "x86_64",
target_arch = "arm", target_arch = "aarch64")))]
#[macro_export]
#[doc(hidden)]
macro_rules! __unstable_detect_feature {

View file

@ -50,7 +50,17 @@ define_from!(u8x16, u64x2, i64x2, u32x4, i32x4, u16x8, i16x8, i8x16);
define_from!(i8x16, u64x2, i64x2, u32x4, i32x4, u16x8, i16x8, u8x16);
define_common_ops!(
f64x2, f32x4, u64x2, i64x2, u32x4, i32x4, u16x8, i16x8, u8x16, i8x16);
f64x2,
f32x4,
u64x2,
i64x2,
u32x4,
i32x4,
u16x8,
i16x8,
u8x16,
i8x16
);
define_float_ops!(f64x2, f32x4);
define_integer_ops!(
(u64x2, u64),
@ -60,7 +70,8 @@ define_integer_ops!(
(u16x8, u16),
(i16x8, i16),
(u8x16, u8),
(i8x16, i8));
(i8x16, i8)
);
define_casts!(
(f64x2, f32x2, as_f32x2),
(f64x2, u64x2, as_u64x2),
@ -79,4 +90,5 @@ define_casts!(
(u16x8, i16x8, as_i16x8),
(i16x8, u16x8, as_u16x8),
(u8x16, i8x16, as_i8x16),
(i8x16, u8x16, as_u8x16));
(i8x16, u8x16, as_u8x16)
);

View file

@ -74,7 +74,17 @@ define_from!(u8x32, u64x4, i64x4, u32x8, i32x8, u16x16, i16x16, i8x32);
define_from!(i8x32, u64x4, i64x4, u32x8, i32x8, u16x16, i16x16, u8x32);
define_common_ops!(
f64x4, f32x8, u64x4, i64x4, u32x8, i32x8, u16x16, i16x16, u8x32, i8x32);
f64x4,
f32x8,
u64x4,
i64x4,
u32x8,
i32x8,
u16x16,
i16x16,
u8x32,
i8x32
);
define_float_ops!(f64x4, f32x8);
define_integer_ops!(
(u64x4, u64),
@ -84,7 +94,8 @@ define_integer_ops!(
(u16x16, u16),
(i16x16, i16),
(u8x32, u8),
(i8x32, i8));
(i8x32, i8)
);
define_casts!(
(f64x4, f32x4, as_f32x4),
(f64x4, u64x4, as_u64x4),
@ -102,4 +113,5 @@ define_casts!(
(u16x16, i16x16, as_i16x16),
(i16x16, u16x16, as_u16x16),
(u8x32, i8x32, as_i8x32),
(i8x32, u8x32, as_u8x32));
(i8x32, u8x32, as_u8x32)
);

View file

@ -120,7 +120,17 @@ define_from!(u8x64, u64x8, i64x8, u32x16, i32x16, u16x32, i16x32, i8x64);
define_from!(i8x64, u64x8, i64x8, u32x16, i32x16, u16x32, i16x32, u8x64);
define_common_ops!(
f64x8, f32x16, u64x8, i64x8, u32x16, i32x16, u16x32, i16x32, u8x64, i8x64);
f64x8,
f32x16,
u64x8,
i64x8,
u32x16,
i32x16,
u16x32,
i16x32,
u8x64,
i8x64
);
define_float_ops!(f64x8, f32x16);
define_integer_ops!(
(u64x8, u64),
@ -130,7 +140,8 @@ define_integer_ops!(
(u16x32, u16),
(i16x32, i16),
(u8x64, u8),
(i8x64, i8));
(i8x64, i8)
);
define_casts!(
(f64x8, f32x8, as_f32x8),
(f64x8, u64x8, as_u64x8),
@ -148,5 +159,5 @@ define_casts!(
(u16x32, i16x32, as_i16x32),
(i16x32, u16x32, as_u16x32),
(u8x64, i8x64, as_i8x64),
(i8x64, u8x64, as_u8x64));
(i8x64, u8x64, as_u8x64)
);

View file

@ -42,7 +42,8 @@ define_integer_ops!(
(u16x4, u16),
(i16x4, i16),
(u8x8, u8),
(i8x8, i8));
(i8x8, i8)
);
define_casts!(
(f32x2, f64x2, as_f64x2),
(f32x2, u32x2, as_u32x2),
@ -61,5 +62,4 @@ define_casts!(
(u8x8, u16x8, as_u16x8),
(u16x4, u32x4, as_u32x4),
(u32x2, u64x2, as_u64x2)
);

View file

@ -4,11 +4,19 @@
//!
//! The references are:
//!
//! - [Intel 64 and IA-32 Architectures Software Developer's Manual Volume 2: Instruction Set Reference, A-Z](http://www.intel.de/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-instruction-set-reference-manual-325383.pdf).
//! - [AMD64 Architecture Programmer's Manual, Volume 3: General-Purpose and System Instructions](http://support.amd.com/TechDocs/24594.pdf).
//! - [Intel 64 and IA-32 Architectures Software Developer's Manual Volume 2:
//! Instruction Set Reference, A-Z][intel64_ref].
//! - [AMD64 Architecture Programmer's Manual, Volume 3: General-Purpose and
//! System Instructions][amd64_ref].
//!
//! [Wikipedia](https://en.wikipedia.org/wiki/Bit_Manipulation_Instruction_Sets#ABM_.28Advanced_Bit_Manipulation.29)
//! provides a quick overview of the instructions available.
//! [Wikipedia][wikipedia_bmi] provides a quick overview of the instructions
//! available.
//!
//! [intel64_ref]: http://www.intel.de/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-instruction-set-reference-manual-325383.pdf
//! [amd64_ref]: http://support.amd.com/TechDocs/24594.pdf
//! [wikipedia_bmi]:
//! https://en.wikipedia.org/wiki/Bit_Manipulation_Instruction_Sets#ABM_.
//! 28Advanced_Bit_Manipulation.29
#[cfg(test)]
use stdsimd_test::assert_instr;
@ -19,7 +27,9 @@ use stdsimd_test::assert_instr;
#[inline(always)]
#[target_feature = "+lzcnt"]
#[cfg_attr(test, assert_instr(lzcnt))]
pub unsafe fn _lzcnt_u32(x: u32) -> u32 { x.leading_zeros() }
pub unsafe fn _lzcnt_u32(x: u32) -> u32 {
x.leading_zeros()
}
/// Counts the leading most significant zero bits.
///
@ -27,19 +37,25 @@ pub unsafe fn _lzcnt_u32(x: u32) -> u32 { x.leading_zeros() }
#[inline(always)]
#[target_feature = "+lzcnt"]
#[cfg_attr(test, assert_instr(lzcnt))]
pub unsafe fn _lzcnt_u64(x: u64) -> u64 { x.leading_zeros() as u64 }
pub unsafe fn _lzcnt_u64(x: u64) -> u64 {
x.leading_zeros() as u64
}
/// Counts the bits that are set.
#[inline(always)]
#[target_feature = "+popcnt"]
#[cfg_attr(test, assert_instr(popcnt))]
pub unsafe fn _popcnt32(x: u32) -> u32 { x.count_ones() }
pub unsafe fn _popcnt32(x: u32) -> u32 {
x.count_ones()
}
/// Counts the bits that are set.
#[inline(always)]
#[target_feature = "+popcnt"]
#[cfg_attr(test, assert_instr(popcnt))]
pub unsafe fn _popcnt64(x: u64) -> u64 { x.count_ones() as u64 }
pub unsafe fn _popcnt64(x: u64) -> u64 {
x.count_ones() as u64
}
#[cfg(test)]
mod tests {

View file

@ -18,7 +18,8 @@ pub unsafe fn _mm256_add_pd(a: f64x4, b: f64x4) -> f64x4 {
a + b
}
/// Add packed single-precision (32-bit) floating-point elements in `a` and `b`.
/// Add packed single-precision (32-bit) floating-point elements in `a` and
/// `b`.
#[inline(always)]
#[target_feature = "+avx"]
#[cfg_attr(test, assert_instr(vaddps))]
@ -26,7 +27,8 @@ pub unsafe fn _mm256_add_ps(a: f32x8, b: f32x8) -> f32x8 {
a + b
}
/// Compute the bitwise AND of a packed double-precision (64-bit) floating-point elements
/// Compute the bitwise AND of a packed double-precision (64-bit)
/// floating-point elements
/// in `a` and `b`.
#[inline(always)]
#[target_feature = "+avx"]
@ -39,7 +41,8 @@ pub unsafe fn _mm256_and_pd(a: f64x4, b: f64x4) -> f64x4 {
mem::transmute(a & b)
}
/// Compute the bitwise AND of packed single-precision (32-bit) floating-point elements in `a` and `b`.
/// Compute the bitwise AND of packed single-precision (32-bit) floating-point
/// elements in `a` and `b`.
#[inline(always)]
#[target_feature = "+avx"]
#[cfg_attr(test, assert_instr(vandps))]
@ -49,7 +52,8 @@ pub unsafe fn _mm256_and_ps(a: f32x8, b: f32x8) -> f32x8 {
mem::transmute(a & b)
}
/// Compute the bitwise OR packed double-precision (64-bit) floating-point elements
/// Compute the bitwise OR packed double-precision (64-bit) floating-point
/// elements
/// in `a` and `b`.
#[inline(always)]
#[target_feature = "+avx"]
@ -62,7 +66,8 @@ pub unsafe fn _mm256_or_pd(a: f64x4, b: f64x4) -> f64x4 {
mem::transmute(a | b)
}
/// Compute the bitwise OR packed single-precision (32-bit) floating-point elements in `a` and `b`.
/// Compute the bitwise OR packed single-precision (32-bit) floating-point
/// elements in `a` and `b`.
#[inline(always)]
#[target_feature = "+avx"]
#[cfg_attr(test, assert_instr(vorps))]
@ -114,7 +119,8 @@ pub unsafe fn _mm256_shuffle_pd(a: f64x4, b: f64x4, imm8: i32) -> f64x4 {
}
}
/// Compute the bitwise NOT of packed double-precision (64-bit) floating-point elements in `a`
/// Compute the bitwise NOT of packed double-precision (64-bit) floating-point
/// elements in `a`
/// and then AND with `b`.
#[inline(always)]
#[target_feature = "+avx"]
@ -126,7 +132,8 @@ pub unsafe fn _mm256_andnot_pd(a: f64x4, b: f64x4) -> f64x4 {
mem::transmute((!a) & b)
}
/// Compute the bitwise NOT of packed single-precision (32-bit) floating-point elements in `a`
/// Compute the bitwise NOT of packed single-precision (32-bit) floating-point
/// elements in `a`
/// and then AND with `b`.
#[inline(always)]
#[target_feature = "+avx"]
@ -146,8 +153,8 @@ pub unsafe fn _mm256_max_pd(a: f64x4, b: f64x4) -> f64x4 {
maxpd256(a, b)
}
/// Compare packed single-precision (32-bit) floating-point elements in `a` and `b`,
/// and return packed maximum values
/// Compare packed single-precision (32-bit) floating-point elements in `a`
/// and `b`, and return packed maximum values
#[inline(always)]
#[target_feature = "+avx"]
#[cfg_attr(test, assert_instr(vmaxps))]
@ -164,8 +171,8 @@ pub unsafe fn _mm256_min_pd(a: f64x4, b: f64x4) -> f64x4 {
minpd256(a, b)
}
/// Compare packed single-precision (32-bit) floating-point elements in `a` and `b`,
/// and return packed minimum values
/// Compare packed single-precision (32-bit) floating-point elements in `a`
/// and `b`, and return packed minimum values
#[inline(always)]
#[target_feature = "+avx"]
#[cfg_attr(test, assert_instr(vminps))]
@ -182,7 +189,8 @@ pub unsafe fn _mm256_mul_pd(a: f64x4, b: f64x4) -> f64x4 {
a * b
}
/// Add packed single-precision (32-bit) floating-point elements in `a` and `b`.
/// Add packed single-precision (32-bit) floating-point elements in `a` and
/// `b`.
#[inline(always)]
#[target_feature = "+avx"]
#[cfg_attr(test, assert_instr(vmulps))]
@ -266,8 +274,8 @@ pub unsafe fn _mm256_round_pd(a: f64x4, b: i32) -> f64x4 {
constify_imm8!(b, call)
}
/// Round packed double-precision (64-bit) floating point elements in `a` toward
/// positive infinity.
/// Round packed double-precision (64-bit) floating point elements in `a`
/// toward positive infinity.
#[inline(always)]
#[target_feature = "+avx"]
#[cfg_attr(test, assert_instr(vroundpd))]
@ -275,8 +283,8 @@ pub unsafe fn _mm256_ceil_pd(a: f64x4) -> f64x4 {
roundpd256(a, 0x02)
}
/// Round packed double-precision (64-bit) floating point elements in `a` toward
/// negative infinity.
/// Round packed double-precision (64-bit) floating point elements in `a`
/// toward negative infinity.
#[inline(always)]
#[target_feature = "+avx"]
#[cfg_attr(test, assert_instr(vroundpd))]
@ -292,9 +300,9 @@ pub unsafe fn _mm256_floor_pd(a: f64x4) -> f64x4 {
/// - `0x02`: Round up, toward positive infinity.
/// - `0x03`: Truncate the values.
///
/// For a complete list of options, check the LLVM docs:
/// For a complete list of options, check [the LLVM docs][llvm_docs].
///
/// https://github.com/llvm-mirror/clang/blob/dcd8d797b20291f1a6b3e0ddda085aa2bbb382a8/lib/Headers/avxintrin.h#L382
/// [llvm_docs]: https://github.com/llvm-mirror/clang/blob/dcd8d797b20291f1a6b3e0ddda085aa2bbb382a8/lib/Headers/avxintrin.h#L382
#[inline(always)]
#[target_feature = "+avx"]
#[cfg_attr(test, assert_instr(vroundps, b = 0x00))]
@ -307,8 +315,8 @@ pub unsafe fn _mm256_round_ps(a: f32x8, b: i32) -> f32x8 {
constify_imm8!(b, call)
}
/// Round packed single-precision (32-bit) floating point elements in `a` toward
/// positive infinity.
/// Round packed single-precision (32-bit) floating point elements in `a`
/// toward positive infinity.
#[inline(always)]
#[target_feature = "+avx"]
#[cfg_attr(test, assert_instr(vroundps))]
@ -316,8 +324,8 @@ pub unsafe fn _mm256_ceil_ps(a: f32x8) -> f32x8 {
roundps256(a, 0x02)
}
/// Round packed single-precision (32-bit) floating point elements in `a` toward
/// negative infinity.
/// Round packed single-precision (32-bit) floating point elements in `a`
/// toward negative infinity.
#[inline(always)]
#[target_feature = "+avx"]
#[cfg_attr(test, assert_instr(vroundps))]
@ -606,7 +614,8 @@ pub unsafe fn _mm256_cmp_ps(a: f32x8, b: f32x8, imm8: u8) -> f32x8 {
/// Compare the lower double-precision (64-bit) floating-point element in
/// `a` and `b` based on the comparison operand specified by `imm8`,
/// store the result in the lower element of returned vector,
/// and copy the upper element from `a` to the upper element of returned vector.
/// and copy the upper element from `a` to the upper element of returned
/// vector.
#[inline(always)]
#[target_feature = "+avx,+sse2"]
#[cfg_attr(test, assert_instr(vcmpeqsd, imm8 = 0))] // TODO Validate vcmpsd
@ -811,7 +820,9 @@ pub unsafe fn _mm_permutevar_ps(a: f32x4, b: i32x4) -> f32x4 {
#[cfg_attr(test, assert_instr(vpermilps, imm8 = 9))]
pub unsafe fn _mm256_permute_ps(a: f32x8, imm8: i32) -> f32x8 {
let imm8 = (imm8 & 0xFF) as u8;
const fn add4(x: u32) -> u32 { x + 4 }
const fn add4(x: u32) -> u32 {
x + 4
}
macro_rules! shuffle4 {
($a:expr, $b:expr, $c:expr, $d:expr) => {
simd_shuffle8(a, _mm256_undefined_ps(), [
@ -857,7 +868,7 @@ pub unsafe fn _mm256_permute_ps(a: f32x8, imm8: i32) -> f32x8 {
}
}
/// Shuffle single-precision (32-bit) floating-point elements in `a`
/// Shuffle single-precision (32-bit) floating-point elements in `a`
/// using the control in `imm8`.
#[inline(always)]
#[target_feature = "+avx,+sse"]
@ -1026,7 +1037,9 @@ pub unsafe fn _mm256_permute2f128_pd(a: f64x4, b: f64x4, imm8: i8) -> f64x4 {
#[inline(always)]
#[target_feature = "+avx"]
#[cfg_attr(test, assert_instr(vperm2f128, imm8 = 0x31))]
pub unsafe fn _mm256_permute2f128_si256(a: i32x8, b: i32x8, imm8: i8) -> i32x8 {
pub unsafe fn _mm256_permute2f128_si256(
a: i32x8, b: i32x8, imm8: i8
) -> i32x8 {
macro_rules! call {
($imm8:expr) => { vperm2f128si256(a, b, $imm8) }
}
@ -1110,7 +1123,9 @@ pub unsafe fn _mm256_insertf128_pd(a: f64x4, b: f64x2, imm8: i32) -> f64x4 {
#[inline(always)]
#[target_feature = "+avx"]
#[cfg_attr(test, assert_instr(vinsertf128, imm8 = 1))]
pub unsafe fn _mm256_insertf128_si256(a: __m256i, b: __m128i, imm8: i32) -> __m256i {
pub unsafe fn _mm256_insertf128_si256(
a: __m256i, b: __m128i, imm8: i32
) -> __m256i {
let b = i64x4::from(_mm256_castsi128_si256(b));
let dst: i64x4 = match imm8 & 1 {
0 => simd_shuffle4(i64x4::from(a), b, [4, 5, 2, 3]),
@ -1166,7 +1181,8 @@ pub unsafe fn _mm256_loadu_pd(mem_addr: *const f64) -> f64x4 {
ptr::copy_nonoverlapping(
mem_addr as *const u8,
&mut dst as *mut f64x4 as *mut u8,
mem::size_of::<f64x4>());
mem::size_of::<f64x4>(),
);
dst
}
@ -1191,7 +1207,8 @@ pub unsafe fn _mm256_loadu_ps(mem_addr: *const f32) -> f32x8 {
ptr::copy_nonoverlapping(
mem_addr as *const u8,
&mut dst as *mut f32x8 as *mut u8,
mem::size_of::<f32x8>());
mem::size_of::<f32x8>(),
);
dst
}
@ -1215,12 +1232,13 @@ pub unsafe fn _mm256_loadu_si256(mem_addr: *const __m256i) -> __m256i {
ptr::copy_nonoverlapping(
mem_addr as *const u8,
&mut dst as *mut __m256i as *mut u8,
mem::size_of::<__m256i>());
mem::size_of::<__m256i>(),
);
dst
}
/// Store 256-bits of integer data from `a` into memory.
/// `mem_addr` does not need to be aligned on any particular boundary.
/// `mem_addr` does not need to be aligned on any particular boundary.
#[inline(always)]
#[target_feature = "+avx"]
#[cfg_attr(test, assert_instr(vmovups))] // FIXME vmovdqu expected
@ -1234,7 +1252,7 @@ pub unsafe fn _mm256_storeu_si256(mem_addr: *mut __m256i, a: __m256i) {
#[inline(always)]
#[target_feature = "+avx"]
#[cfg_attr(test, assert_instr(vmaskmovpd))]
pub unsafe fn _mm256_maskload_pd(mem_addr: *const f64, mask: i64x4) -> f64x4 {
pub unsafe fn _mm256_maskload_pd(mem_addr: *const f64, mask: i64x4) -> f64x4 {
maskloadpd256(mem_addr as *const i8, mask)
}
@ -1272,7 +1290,7 @@ pub unsafe fn _mm_maskstore_pd(mem_addr: *mut f64, mask: i64x2, a: f64x2) {
#[inline(always)]
#[target_feature = "+avx"]
#[cfg_attr(test, assert_instr(vmaskmovps))]
pub unsafe fn _mm256_maskload_ps(mem_addr: *const f32, mask: i32x8) -> f32x8 {
pub unsafe fn _mm256_maskload_ps(mem_addr: *const f32, mask: i32x8) -> f32x8 {
maskloadps256(mem_addr as *const i8, mask)
}
@ -1592,7 +1610,8 @@ pub unsafe fn _mm_testnzc_ps(a: f32x4, b: f32x4) -> i32 {
}
/// Set each bit of the returned mask based on the most significant bit of the
/// corresponding packed double-precision (64-bit) floating-point element in `a`.
/// corresponding packed double-precision (64-bit) floating-point element in
/// `a`.
#[inline(always)]
#[target_feature = "+avx"]
#[cfg_attr(test, assert_instr(vmovmskpd))]
@ -1601,7 +1620,8 @@ pub unsafe fn _mm256_movemask_pd(a: f64x4) -> i32 {
}
/// Set each bit of the returned mask based on the most significant bit of the
/// corresponding packed single-precision (32-bit) floating-point element in `a`.
/// corresponding packed single-precision (32-bit) floating-point element in
/// `a`.
#[inline(always)]
#[target_feature = "+avx"]
#[cfg_attr(test, assert_instr(vmovmskps))]
@ -1646,8 +1666,9 @@ pub unsafe fn _mm256_set_pd(a: f64, b: f64, c: f64, d: f64) -> f64x4 {
/// vector with the supplied values.
#[inline(always)]
#[target_feature = "+avx"]
pub unsafe fn _mm256_set_ps(a: f32, b: f32, c: f32, d: f32,
e: f32, f: f32, g: f32, h: f32) -> f32x8 {
pub unsafe fn _mm256_set_ps(
a: f32, b: f32, c: f32, d: f32, e: f32, f: f32, g: f32, h: f32
) -> f32x8 {
f32x8::new(h, g, f, e, d, c, b, a)
}
@ -1655,44 +1676,45 @@ pub unsafe fn _mm256_set_ps(a: f32, b: f32, c: f32, d: f32,
/// reverse order.
#[inline(always)]
#[target_feature = "+avx"]
pub unsafe fn _mm256_set_epi8(e00: i8, e01: i8, e02: i8, e03: i8,
e04: i8, e05: i8, e06: i8, e07: i8,
e08: i8, e09: i8, e10: i8, e11: i8,
e12: i8, e13: i8, e14: i8, e15: i8,
e16: i8, e17: i8, e18: i8, e19: i8,
e20: i8, e21: i8, e22: i8, e23: i8,
e24: i8, e25: i8, e26: i8, e27: i8,
e28: i8, e29: i8, e30: i8, e31: i8) -> i8x32 {
i8x32::new(e31, e30, e29, e28,
e27, e26, e25, e24,
e23, e22, e21, e20,
e19, e18, e17, e16,
e15, e14, e13, e12,
e11, e10, e09, e08,
e07, e06, e05, e04,
e03, e02, e01, e00)
pub unsafe fn _mm256_set_epi8(
e00: i8, e01: i8, e02: i8, e03: i8, e04: i8, e05: i8, e06: i8, e07: i8,
e08: i8, e09: i8, e10: i8, e11: i8, e12: i8, e13: i8, e14: i8, e15: i8,
e16: i8, e17: i8, e18: i8, e19: i8, e20: i8, e21: i8, e22: i8, e23: i8,
e24: i8, e25: i8, e26: i8, e27: i8, e28: i8, e29: i8, e30: i8, e31: i8,
) -> i8x32 {
#[cfg_attr(rustfmt, rustfmt_skip)]
i8x32::new(
e31, e30, e29, e28, e27, e26, e25, e24,
e23, e22, e21, e20, e19, e18, e17, e16,
e15, e14, e13, e12, e11, e10, e09, e08,
e07, e06, e05, e04, e03, e02, e01, e00,
)
}
/// Set packed 16-bit integers in returned vector with the supplied values.
#[inline(always)]
#[target_feature = "+avx"]
pub unsafe fn _mm256_set_epi16(e00: i16, e01: i16, e02: i16, e03: i16,
e04: i16, e05: i16, e06: i16, e07: i16,
e08: i16, e09: i16, e10: i16, e11: i16,
e12: i16, e13: i16, e14: i16, e15: i16) -> i16x16 {
i16x16::new(e15, e14, e13, e12,
e11, e10, e09, e08,
e07, e06, e05, e04,
e03, e02, e01, e00)
pub unsafe fn _mm256_set_epi16(
e00: i16, e01: i16, e02: i16, e03: i16, e04: i16, e05: i16, e06: i16,
e07: i16, e08: i16, e09: i16, e10: i16, e11: i16, e12: i16, e13: i16,
e14: i16, e15: i16,
) -> i16x16 {
#[cfg_attr(rustfmt, rustfmt_skip)]
i16x16::new(
e15, e14, e13, e12,
e11, e10, e09, e08,
e07, e06, e05, e04,
e03, e02, e01, e00,
)
}
/// Set packed 32-bit integers in returned vector with the supplied values.
#[inline(always)]
#[target_feature = "+avx"]
pub unsafe fn _mm256_set_epi32(e0: i32, e1: i32, e2: i32, e3: i32,
e4: i32, e5: i32, e6: i32, e7: i32) -> i32x8 {
i32x8::new(e7, e6, e5, e4,
e3, e2, e1, e0)
pub unsafe fn _mm256_set_epi32(
e0: i32, e1: i32, e2: i32, e3: i32, e4: i32, e5: i32, e6: i32, e7: i32
) -> i32x8 {
i32x8::new(e7, e6, e5, e4, e3, e2, e1, e0)
}
/// Set packed 64-bit integers in returned vector with the supplied values.
@ -1715,8 +1737,9 @@ pub unsafe fn _mm256_setr_pd(a: f64, b: f64, c: f64, d: f64) -> f64x4 {
/// vector with the supplied values in reverse order.
#[inline(always)]
#[target_feature = "+avx"]
pub unsafe fn _mm256_setr_ps(a: f32, b: f32, c: f32, d: f32,
e: f32, f: f32, g: f32, h: f32) -> f32x8 {
pub unsafe fn _mm256_setr_ps(
a: f32, b: f32, c: f32, d: f32, e: f32, f: f32, g: f32, h: f32
) -> f32x8 {
f32x8::new(a, b, c, d, e, f, g, h)
}
@ -1724,46 +1747,47 @@ pub unsafe fn _mm256_setr_ps(a: f32, b: f32, c: f32, d: f32,
/// reverse order.
#[inline(always)]
#[target_feature = "+avx"]
pub unsafe fn _mm256_setr_epi8(e00: i8, e01: i8, e02: i8, e03: i8,
e04: i8, e05: i8, e06: i8, e07: i8,
e08: i8, e09: i8, e10: i8, e11: i8,
e12: i8, e13: i8, e14: i8, e15: i8,
e16: i8, e17: i8, e18: i8, e19: i8,
e20: i8, e21: i8, e22: i8, e23: i8,
e24: i8, e25: i8, e26: i8, e27: i8,
e28: i8, e29: i8, e30: i8, e31: i8) -> i8x32 {
i8x32::new(e00, e01, e02, e03,
e04, e05, e06, e07,
e08, e09, e10, e11,
e12, e13, e14, e15,
e16, e17, e18, e19,
e20, e21, e22, e23,
e24, e25, e26, e27,
e28, e29, e30, e31)
pub unsafe fn _mm256_setr_epi8(
e00: i8, e01: i8, e02: i8, e03: i8, e04: i8, e05: i8, e06: i8, e07: i8,
e08: i8, e09: i8, e10: i8, e11: i8, e12: i8, e13: i8, e14: i8, e15: i8,
e16: i8, e17: i8, e18: i8, e19: i8, e20: i8, e21: i8, e22: i8, e23: i8,
e24: i8, e25: i8, e26: i8, e27: i8, e28: i8, e29: i8, e30: i8, e31: i8,
) -> i8x32 {
#[cfg_attr(rustfmt, rustfmt_skip)]
i8x32::new(
e00, e01, e02, e03, e04, e05, e06, e07,
e08, e09, e10, e11, e12, e13, e14, e15,
e16, e17, e18, e19, e20, e21, e22, e23,
e24, e25, e26, e27, e28, e29, e30, e31,
)
}
/// Set packed 16-bit integers in returned vector with the supplied values in
/// reverse order.
#[inline(always)]
#[target_feature = "+avx"]
pub unsafe fn _mm256_setr_epi16(e00: i16, e01: i16, e02: i16, e03: i16,
e04: i16, e05: i16, e06: i16, e07: i16,
e08: i16, e09: i16, e10: i16, e11: i16,
e12: i16, e13: i16, e14: i16, e15: i16) -> i16x16 {
i16x16::new(e00, e01, e02, e03,
e04, e05, e06, e07,
e08, e09, e10, e11,
e12, e13, e14, e15)
pub unsafe fn _mm256_setr_epi16(
e00: i16, e01: i16, e02: i16, e03: i16, e04: i16, e05: i16, e06: i16,
e07: i16, e08: i16, e09: i16, e10: i16, e11: i16, e12: i16, e13: i16,
e14: i16, e15: i16,
) -> i16x16 {
#[cfg_attr(rustfmt, rustfmt_skip)]
i16x16::new(
e00, e01, e02, e03,
e04, e05, e06, e07,
e08, e09, e10, e11,
e12, e13, e14, e15,
)
}
/// Set packed 32-bit integers in returned vector with the supplied values in
/// reverse order.
#[inline(always)]
#[target_feature = "+avx"]
pub unsafe fn _mm256_setr_epi32(e0: i32, e1: i32, e2: i32, e3: i32,
e4: i32, e5: i32, e6: i32, e7: i32) -> i32x8 {
i32x8::new(e0, e1, e2, e3,
e4, e5, e6, e7)
pub unsafe fn _mm256_setr_epi32(
e0: i32, e1: i32, e2: i32, e3: i32, e4: i32, e5: i32, e6: i32, e7: i32
) -> i32x8 {
i32x8::new(e0, e1, e2, e3, e4, e5, e6, e7)
}
/// Set packed 64-bit integers in returned vector with the supplied values in
@ -1798,10 +1822,13 @@ pub unsafe fn _mm256_set1_ps(a: f32) -> f32x8 {
#[cfg_attr(test, assert_instr(vpshufb))]
#[cfg_attr(test, assert_instr(vinsertf128))]
pub unsafe fn _mm256_set1_epi8(a: i8) -> i8x32 {
i8x32::new(a, a, a, a, a, a, a, a,
a, a, a, a, a, a, a, a,
a, a, a, a, a, a, a, a,
a, a, a, a, a, a, a, a)
#[cfg_attr(rustfmt, rustfmt_skip)]
i8x32::new(
a, a, a, a, a, a, a, a,
a, a, a, a, a, a, a, a,
a, a, a, a, a, a, a, a,
a, a, a, a, a, a, a, a,
)
}
/// Broadcast 16-bit integer `a` to all elements of returned vector.
@ -1811,8 +1838,7 @@ pub unsafe fn _mm256_set1_epi8(a: i8) -> i8x32 {
//#[cfg_attr(test, assert_instr(vpshufb))]
#[cfg_attr(test, assert_instr(vinsertf128))]
pub unsafe fn _mm256_set1_epi16(a: i16) -> i16x16 {
i16x16::new(a, a, a, a, a, a, a, a,
a, a, a, a, a, a, a, a)
i16x16::new(a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a)
}
/// Broadcast 32-bit integer `a` to all elements of returned vector.
@ -1954,7 +1980,7 @@ pub unsafe fn _mm256_zextps128_ps256(a: f32x4) -> f32x8 {
#[target_feature = "+avx,+sse2"]
pub unsafe fn _mm256_zextsi128_si256(a: __m128i) -> __m256i {
use x86::sse2::_mm_setzero_si128;
let b = mem::transmute(_mm_setzero_si128());
let b = mem::transmute(_mm_setzero_si128());
let dst: i64x4 = simd_shuffle4(i64x2::from(a), b, [0, 1, 2, 3]);
__m256i::from(dst)
}
@ -2044,22 +2070,28 @@ pub unsafe fn _mm256_setr_m128i(lo: __m128i, hi: __m128i) -> __m256i {
}
/// Load two 128-bit values (composed of 4 packed single-precision (32-bit)
/// floating-point elements) from memory, and combine them into a 256-bit value.
/// floating-point elements) from memory, and combine them into a 256-bit
/// value.
/// `hiaddr` and `loaddr` do not need to be aligned on any particular boundary.
#[inline(always)]
#[target_feature = "+avx,+sse"]
pub unsafe fn _mm256_loadu2_m128(hiaddr: *const f32, loaddr: *const f32) -> f32x8 {
pub unsafe fn _mm256_loadu2_m128(
hiaddr: *const f32, loaddr: *const f32
) -> f32x8 {
use x86::sse::_mm_loadu_ps;
let a = _mm256_castps128_ps256(_mm_loadu_ps(loaddr));
_mm256_insertf128_ps(a, _mm_loadu_ps(hiaddr), 1)
}
/// Load two 128-bit values (composed of 2 packed double-precision (64-bit)
/// floating-point elements) from memory, and combine them into a 256-bit value.
/// floating-point elements) from memory, and combine them into a 256-bit
/// value.
/// `hiaddr` and `loaddr` do not need to be aligned on any particular boundary.
#[inline(always)]
#[target_feature = "+avx,+sse2"]
pub unsafe fn _mm256_loadu2_m128d(hiaddr: *const f64, loaddr: *const f64) -> f64x4 {
pub unsafe fn _mm256_loadu2_m128d(
hiaddr: *const f64, loaddr: *const f64
) -> f64x4 {
use x86::sse2::_mm_loadu_pd;
let a = _mm256_castpd128_pd256(_mm_loadu_pd(loaddr));
_mm256_insertf128_pd(a, _mm_loadu_pd(hiaddr), 1)
@ -2070,7 +2102,9 @@ pub unsafe fn _mm256_loadu2_m128d(hiaddr: *const f64, loaddr: *const f64) -> f64
/// `hiaddr` and `loaddr` do not need to be aligned on any particular boundary.
#[inline(always)]
#[target_feature = "+avx,+sse2"]
pub unsafe fn _mm256_loadu2_m128i(hiaddr: *const __m128i, loaddr: *const __m128i) -> __m256i {
pub unsafe fn _mm256_loadu2_m128i(
hiaddr: *const __m128i, loaddr: *const __m128i
) -> __m256i {
use x86::sse2::_mm_loadu_si128;
let a = _mm256_castsi128_si256(_mm_loadu_si128(loaddr));
_mm256_insertf128_si256(a, _mm_loadu_si128(hiaddr), 1)
@ -2082,7 +2116,9 @@ pub unsafe fn _mm256_loadu2_m128i(hiaddr: *const __m128i, loaddr: *const __m128i
/// `hiaddr` and `loaddr` do not need to be aligned on any particular boundary.
#[inline(always)]
#[target_feature = "+avx,+sse"]
pub unsafe fn _mm256_storeu2_m128(hiaddr: *mut f32, loaddr: *mut f32, a: f32x8) {
pub unsafe fn _mm256_storeu2_m128(
hiaddr: *mut f32, loaddr: *mut f32, a: f32x8
) {
use x86::sse::_mm_storeu_ps;
let lo = _mm256_castps256_ps128(a);
_mm_storeu_ps(loaddr, lo);
@ -2096,7 +2132,9 @@ pub unsafe fn _mm256_storeu2_m128(hiaddr: *mut f32, loaddr: *mut f32, a: f32x8)
/// `hiaddr` and `loaddr` do not need to be aligned on any particular boundary.
#[inline(always)]
#[target_feature = "+avx,+sse2"]
pub unsafe fn _mm256_storeu2_m128d(hiaddr: *mut f64, loaddr: *mut f64, a: f64x4) {
pub unsafe fn _mm256_storeu2_m128d(
hiaddr: *mut f64, loaddr: *mut f64, a: f64x4
) {
use x86::sse2::_mm_storeu_pd;
let lo = _mm256_castpd256_pd128(a);
_mm_storeu_pd(loaddr, lo);
@ -2109,7 +2147,9 @@ pub unsafe fn _mm256_storeu2_m128d(hiaddr: *mut f64, loaddr: *mut f64, a: f64x4)
/// `hiaddr` and `loaddr` do not need to be aligned on any particular boundary.
#[inline(always)]
#[target_feature = "+avx,+sse2"]
pub unsafe fn _mm256_storeu2_m128i(hiaddr: *mut __m128i, loaddr: *mut __m128i, a: __m256i) {
pub unsafe fn _mm256_storeu2_m128i(
hiaddr: *mut __m128i, loaddr: *mut __m128i, a: __m256i
) {
use x86::sse2::_mm_storeu_si128;
let lo = _mm256_castsi256_si128(a);
_mm_storeu_si128(loaddr, lo);
@ -2265,9 +2305,9 @@ extern "C" {
#[cfg(test)]
mod tests {
use stdsimd_test::simd_test;
use test::black_box; // Used to inhibit constant-folding.
use test::black_box; // Used to inhibit constant-folding.
use v128::{f32x4, f64x2, i8x16, i32x4, i64x2};
use v128::{f32x4, f64x2, i32x4, i64x2, i8x16};
use v256::*;
use x86::avx;
use x86::{__m128i, __m256i};
@ -2428,7 +2468,7 @@ mod tests {
let a = f64x4::new(1., 2., 3., 4.);
let b = f64x4::new(5., 6., 7., 8.);
let r = avx::_mm256_sub_pd(a, b);
let e = f64x4::new(-4.,-4.,-4.,-4.);
let e = f64x4::new(-4., -4., -4., -4.);
assert_eq!(r, e);
}
@ -2504,7 +2544,7 @@ mod tests {
#[simd_test = "avx"]
unsafe fn _mm256_sqrt_pd() {
let a = f64x4::new(4., 9., 16., 25.);
let r = avx::_mm256_sqrt_pd(a, );
let r = avx::_mm256_sqrt_pd(a);
let e = f64x4::new(2., 3., 4., 5.);
assert_eq!(r, e);
}
@ -2561,7 +2601,10 @@ mod tests {
unsafe fn _mm256_blendv_ps() {
let a = f32x8::new(4., 9., 16., 25., 4., 9., 16., 25.);
let b = f32x8::new(4., 3., 2., 5., 8., 9., 64., 50.0);
let c = f32x8::new(0., 0., 0., 0., !0 as f32, !0 as f32, !0 as f32, !0 as f32);
#[cfg_attr(rustfmt, rustfmt_skip)]
let c = f32x8::new(
0., 0., 0., 0., !0 as f32, !0 as f32, !0 as f32, !0 as f32,
);
let r = avx::_mm256_blendv_ps(a, b, c);
let e = f32x8::new(4., 9., 16., 25., 8., 9., 64., 50.0);
assert_eq!(r, e);
@ -2572,7 +2615,8 @@ mod tests {
let a = f32x8::new(4., 9., 16., 25., 4., 9., 16., 25.);
let b = f32x8::new(4., 3., 2., 5., 8., 9., 64., 50.0);
let r = avx::_mm256_dp_ps(a, b, 0xFF);
let e = f32x8::new(200.0, 200.0, 200.0, 200.0, 2387., 2387., 2387., 2387.);
let e =
f32x8::new(200.0, 200.0, 200.0, 200.0, 2387., 2387., 2387., 2387.);
assert_eq!(r, e);
}
@ -2801,20 +2845,21 @@ mod tests {
#[simd_test = "avx"]
unsafe fn _mm256_extract_epi8() {
#[cfg_attr(rustfmt, rustfmt_skip)]
let a = i8x32::new(
1, 2, 3, 4, 5, 6, 7, 8,
9, 10, 11, 12, 13, 14, 15, 16,
17, 18, 19, 20, 21, 22, 23, 24,
25, 26, 27, 28, 29, 30, 31, 32);
25, 26, 27, 28, 29, 30, 31, 32,
);
let r = avx::_mm256_extract_epi8(a, 0);
assert_eq!(r, 1);
}
#[simd_test = "avx"]
unsafe fn _mm256_extract_epi16() {
let a = i16x16::new(
0, 1, 2, 3, 4, 5, 6, 7,
8, 9, 10, 11, 12, 13, 14, 15);
let a =
i16x16::new(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
let r = avx::_mm256_extract_epi16(a, 0);
assert_eq!(r, 0);
}
@ -3004,29 +3049,31 @@ mod tests {
#[simd_test = "avx"]
unsafe fn _mm256_insert_epi8() {
#[cfg_attr(rustfmt, rustfmt_skip)]
let a = i8x32::new(
1, 2, 3, 4, 5, 6, 7, 8,
9, 10, 11, 12, 13, 14, 15, 16,
17, 18, 19, 20, 21, 22, 23, 24,
25, 26, 27, 28, 29, 30, 31, 32);
25, 26, 27, 28, 29, 30, 31, 32,
);
let r = avx::_mm256_insert_epi8(a, 0, 31);
#[cfg_attr(rustfmt, rustfmt_skip)]
let e = i8x32::new(
1, 2, 3, 4, 5, 6, 7, 8,
9, 10, 11, 12, 13, 14, 15, 16,
17, 18, 19, 20, 21, 22, 23, 24,
25, 26, 27, 28, 29, 30, 31, 0);
25, 26, 27, 28, 29, 30, 31, 0,
);
assert_eq!(r, e);
}
#[simd_test = "avx"]
unsafe fn _mm256_insert_epi16() {
let a = i16x16::new(
0, 1, 2, 3, 4, 5, 6, 7,
8, 9, 10, 11, 12, 13, 14, 15);
let a =
i16x16::new(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
let r = avx::_mm256_insert_epi16(a, 0, 15);
let e = i16x16::new(
0, 1, 2, 3, 4, 5, 6, 7,
8, 9, 10, 11, 12, 13, 14, 0);
let e =
i16x16::new(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 0);
assert_eq!(r, e);
}
@ -3203,18 +3250,22 @@ mod tests {
#[simd_test = "avx"]
unsafe fn _mm256_lddqu_si256() {
#[cfg_attr(rustfmt, rustfmt_skip)]
let a = i8x32::new(
1, 2, 3, 4, 5, 6, 7, 8,
9, 10, 11, 12, 13, 14, 15, 16,
17, 18, 19, 20, 21, 22, 23, 24,
25, 26, 27, 28, 29, 30, 31, 32);
25, 26, 27, 28, 29, 30, 31, 32,
);
let p = &a as *const _;
let r = avx::_mm256_lddqu_si256(black_box(p));
#[cfg_attr(rustfmt, rustfmt_skip)]
let e = i8x32::new(
1, 2, 3, 4, 5, 6, 7, 8,
9, 10, 11, 12, 13, 14, 15, 16,
17, 18, 19, 20, 21, 22, 23, 24,
25, 26, 27, 28, 29, 30, 31, 32);
25, 26, 27, 28, 29, 30, 31, 32,
);
assert_eq!(r, e);
}
@ -3222,8 +3273,11 @@ mod tests {
unsafe fn _mm256_rcp_ps() {
let a = f32x8::new(1., 2., 3., 4., 5., 6., 7., 8.);
let r = avx::_mm256_rcp_ps(a);
let e = f32x8::new(0.99975586, 0.49987793, 0.33325195, 0.24993896,
0.19995117, 0.16662598, 0.14282227, 0.12496948);
#[cfg_attr(rustfmt, rustfmt_skip)]
let e = f32x8::new(
0.99975586, 0.49987793, 0.33325195, 0.24993896,
0.19995117, 0.16662598, 0.14282227, 0.12496948,
);
let rel_err = 0.00048828125;
for i in 0..8 {
assert_approx_eq!(r.extract(i), e.extract(i), 2. * rel_err);
@ -3234,8 +3288,11 @@ mod tests {
unsafe fn _mm256_rsqrt_ps() {
let a = f32x8::new(1., 2., 3., 4., 5., 6., 7., 8.);
let r = avx::_mm256_rsqrt_ps(a);
let e = f32x8::new(0.99975586, 0.7069092, 0.5772705, 0.49987793,
0.44714355, 0.40820313, 0.3779297, 0.3534546);
#[cfg_attr(rustfmt, rustfmt_skip)]
let e = f32x8::new(
0.99975586, 0.7069092, 0.5772705, 0.49987793,
0.44714355, 0.40820313, 0.3779297, 0.3534546,
);
let rel_err = 0.00048828125;
for i in 0..8 {
assert_approx_eq!(r.extract(i), e.extract(i), 2. * rel_err);
@ -3478,30 +3535,39 @@ mod tests {
#[simd_test = "avx"]
unsafe fn _mm256_set_epi8() {
#[cfg_attr(rustfmt, rustfmt_skip)]
let r = avx::_mm256_set_epi8(
1, 2, 3, 4, 5, 6, 7, 8,
9, 10, 11, 12, 13, 14, 15, 16,
17, 18, 19, 20, 21, 22, 23, 24,
25, 26, 27, 28, 29, 30, 31, 32);
assert_eq!(r, i8x32::new(32, 31, 30, 29, 28, 27, 26, 25,
24, 23, 22, 21, 20, 19, 18, 17,
16, 15, 14, 13, 12, 11, 10, 9,
8, 7, 6, 5, 4, 3, 2, 1));
25, 26, 27, 28, 29, 30, 31, 32,
);
#[cfg_attr(rustfmt, rustfmt_skip)]
let e = i8x32::new(
32, 31, 30, 29, 28, 27, 26, 25,
24, 23, 22, 21, 20, 19, 18, 17,
16, 15, 14, 13, 12, 11, 10, 9,
8, 7, 6, 5, 4, 3, 2, 1
);
assert_eq!(r, e);
}
#[simd_test = "avx"]
unsafe fn _mm256_set_epi16() {
#[cfg_attr(rustfmt, rustfmt_skip)]
let r = avx::_mm256_set_epi16(
1, 2, 3, 4, 5, 6, 7, 8,
9, 10, 11, 12, 13, 14, 15, 16);
assert_eq!(r, i16x16::new(16, 15, 14, 13, 12, 11, 10, 9,
8, 7, 6, 5, 4, 3, 2, 1));
9, 10, 11, 12, 13, 14, 15, 16,
);
assert_eq!(
r,
i16x16::new(16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1)
);
}
#[simd_test = "avx"]
unsafe fn _mm256_set_epi32() {
let r = avx::_mm256_set_epi32(
1, 2, 3, 4, 5, 6, 7, 8);
let r = avx::_mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, 8);
assert_eq!(r, i32x8::new(8, 7, 6, 5, 4, 3, 2, 1));
}
@ -3525,30 +3591,40 @@ mod tests {
#[simd_test = "avx"]
unsafe fn _mm256_setr_epi8() {
#[cfg_attr(rustfmt, rustfmt_skip)]
let r = avx::_mm256_setr_epi8(
1, 2, 3, 4, 5, 6, 7, 8,
9, 10, 11, 12, 13, 14, 15, 16,
17, 18, 19, 20, 21, 22, 23, 24,
25, 26, 27, 28, 29, 30, 31, 32);
assert_eq!(r, i8x32::new(1, 2, 3, 4, 5, 6, 7, 8,
9, 10, 11, 12, 13, 14, 15, 16,
17, 18, 19, 20, 21, 22, 23, 24,
25, 26, 27, 28, 29, 30, 31, 32));
25, 26, 27, 28, 29, 30, 31, 32,
);
#[cfg_attr(rustfmt, rustfmt_skip)]
let e = i8x32::new(
1, 2, 3, 4, 5, 6, 7, 8,
9, 10, 11, 12, 13, 14, 15, 16,
17, 18, 19, 20, 21, 22, 23, 24,
25, 26, 27, 28, 29, 30, 31, 32
);
assert_eq!(r, e);
}
#[simd_test = "avx"]
unsafe fn _mm256_setr_epi16() {
#[cfg_attr(rustfmt, rustfmt_skip)]
let r = avx::_mm256_setr_epi16(
1, 2, 3, 4, 5, 6, 7, 8,
9, 10, 11, 12, 13, 14, 15, 16);
assert_eq!(r, i16x16::new(1, 2, 3, 4, 5, 6, 7, 8,
9, 10, 11, 12, 13, 14, 15, 16));
9, 10, 11, 12, 13, 14, 15, 16,
);
assert_eq!(
r,
i16x16::new(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16)
);
}
#[simd_test = "avx"]
unsafe fn _mm256_setr_epi32() {
let r = avx::_mm256_setr_epi32(
1, 2, 3, 4, 5, 6, 7, 8);
let r = avx::_mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8);
assert_eq!(r, i32x8::new(1, 2, 3, 4, 5, 6, 7, 8));
}
@ -3614,19 +3690,25 @@ mod tests {
unsafe fn _mm256_castps_si256() {
let a = f32x8::new(1., 2., 3., 4., 5., 6., 7., 8.);
let r = avx::_mm256_castps_si256(a);
let e = i8x32::new(0, 0, -128, 63, 0, 0, 0, 64,
0, 0, 64, 64, 0, 0, -128, 64,
0, 0, -96, 64, 0, 0, -64, 64,
0, 0, -32, 64, 0, 0, 0, 65);
#[cfg_attr(rustfmt, rustfmt_skip)]
let e = i8x32::new(
0, 0, -128, 63, 0, 0, 0, 64,
0, 0, 64, 64, 0, 0, -128, 64,
0, 0, -96, 64, 0, 0, -64, 64,
0, 0, -32, 64, 0, 0, 0, 65,
);
assert_eq!(r, e);
}
#[simd_test = "avx"]
unsafe fn _mm256_castsi256_ps() {
let a = i8x32::new(0, 0, -128, 63, 0, 0, 0, 64,
0, 0, 64, 64, 0, 0, -128, 64,
0, 0, -96, 64, 0, 0, -64, 64,
0, 0, -32, 64, 0, 0, 0, 65);
#[cfg_attr(rustfmt, rustfmt_skip)]
let a = i8x32::new(
0, 0, -128, 63, 0, 0, 0, 64,
0, 0, 64, 64, 0, 0, -128, 64,
0, 0, -96, 64, 0, 0, -64, 64,
0, 0, -32, 64, 0, 0, 0, 65,
);
let r = avx::_mm256_castsi256_ps(a);
let e = f32x8::new(1., 2., 3., 4., 5., 6., 7., 8.);
assert_eq!(r, e);
@ -3711,16 +3793,23 @@ mod tests {
#[simd_test = "avx"]
unsafe fn _mm256_set_m128i() {
let hi = i8x16::new(17, 18, 19, 20, 21, 22, 23, 24,
25, 26, 27, 28, 29, 30, 31, 32);
let lo = i8x16::new(1, 2, 3, 4, 5, 6, 7, 8,
9, 10, 11, 12, 13, 14, 15, 16);
#[cfg_attr(rustfmt, rustfmt_skip)]
let hi = i8x16::new(
17, 18, 19, 20,
21, 22, 23, 24,
25, 26, 27, 28,
29, 30, 31, 32,
);
let lo =
i8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
let r = avx::_mm256_set_m128i(hi, lo);
#[cfg_attr(rustfmt, rustfmt_skip)]
let e = i8x32::new(
1, 2, 3, 4, 5, 6, 7, 8,
9, 10, 11, 12, 13, 14, 15, 16,
17, 18, 19, 20, 21, 22, 23, 24,
25, 26, 27, 28, 29, 30, 31, 32);
25, 26, 27, 28, 29, 30, 31, 32,
);
assert_eq!(r, e);
}
@ -3744,16 +3833,21 @@ mod tests {
#[simd_test = "avx"]
unsafe fn _mm256_setr_m128i() {
let lo = i8x16::new(1, 2, 3, 4, 5, 6, 7, 8,
9, 10, 11, 12, 13, 14, 15, 16);
let hi = i8x16::new(17, 18, 19, 20, 21, 22, 23, 24,
25, 26, 27, 28, 29, 30, 31, 32);
let lo =
i8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
#[cfg_attr(rustfmt, rustfmt_skip)]
let hi = i8x16::new(
17, 18, 19, 20, 21, 22, 23, 24,
25, 26, 27, 28, 29, 30, 31, 32,
);
let r = avx::_mm256_setr_m128i(lo, hi);
#[cfg_attr(rustfmt, rustfmt_skip)]
let e = i8x32::new(
1, 2, 3, 4, 5, 6, 7, 8,
9, 10, 11, 12, 13, 14, 15, 16,
17, 18, 19, 20, 21, 22, 23, 24,
25, 26, 27, 28, 29, 30, 31, 32);
25, 26, 27, 28, 29, 30, 31, 32,
);
assert_eq!(r, e);
}
@ -3781,17 +3875,24 @@ mod tests {
#[simd_test = "avx"]
unsafe fn _mm256_loadu2_m128i() {
let hi = i8x16::new(17, 18, 19, 20, 21, 22, 23, 24,
25, 26, 27, 28, 29, 30, 31, 32);
let lo = i8x16::new(1, 2, 3, 4, 5, 6, 7, 8,
9, 10, 11, 12, 13, 14, 15, 16);
let r = avx::_mm256_loadu2_m128i(&hi as *const _ as *const _,
&lo as *const _ as *const _);
#[cfg_attr(rustfmt, rustfmt_skip)]
let hi = i8x16::new(
17, 18, 19, 20, 21, 22, 23, 24,
25, 26, 27, 28, 29, 30, 31, 32,
);
let lo =
i8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
let r = avx::_mm256_loadu2_m128i(
&hi as *const _ as *const _,
&lo as *const _ as *const _,
);
#[cfg_attr(rustfmt, rustfmt_skip)]
let e = i8x32::new(
1, 2, 3, 4, 5, 6, 7, 8,
9, 10, 11, 12, 13, 14, 15, 16,
17, 18, 19, 20, 21, 22, 23, 24,
25, 26, 27, 28, 29, 30, 31, 32);
25, 26, 27, 28, 29, 30, 31, 32,
);
assert_eq!(r, e);
}
@ -3801,9 +3902,11 @@ mod tests {
let a = f32x8::new(1., 2., 3., 4., 5., 6., 7., 8.);
let mut hi = _mm_undefined_ps();
let mut lo = _mm_undefined_ps();
avx::_mm256_storeu2_m128(&mut hi as *mut _ as *mut f32,
&mut lo as *mut _ as *mut f32,
a);
avx::_mm256_storeu2_m128(
&mut hi as *mut _ as *mut f32,
&mut lo as *mut _ as *mut f32,
a,
);
assert_eq!(hi, f32x4::new(5., 6., 7., 8.));
assert_eq!(lo, f32x4::new(1., 2., 3., 4.));
}
@ -3814,9 +3917,11 @@ mod tests {
let a = f64x4::new(1., 2., 3., 4.);
let mut hi = _mm_undefined_pd();
let mut lo = _mm_undefined_pd();
avx::_mm256_storeu2_m128d(&mut hi as *mut _ as *mut f64,
&mut lo as *mut _ as *mut f64,
a);
avx::_mm256_storeu2_m128d(
&mut hi as *mut _ as *mut f64,
&mut lo as *mut _ as *mut f64,
a,
);
assert_eq!(hi, f64x2::new(3., 4.));
assert_eq!(lo, f64x2::new(1., 2.));
}
@ -3824,17 +3929,26 @@ mod tests {
#[simd_test = "avx"]
unsafe fn _mm256_storeu2_m128i() {
use x86::sse2::_mm_undefined_si128;
#[cfg_attr(rustfmt, rustfmt_skip)]
let a = i8x32::new(
1, 2, 3, 4, 5, 6, 7, 8,
9, 10, 11, 12, 13, 14, 15, 16,
17, 18, 19, 20, 21, 22, 23, 24,
25, 26, 27, 28, 29, 30, 31, 32);
25, 26, 27, 28, 29, 30, 31, 32,
);
let mut hi = _mm_undefined_si128();
let mut lo = _mm_undefined_si128();
avx::_mm256_storeu2_m128i(&mut hi as *mut _, &mut lo as *mut _, a);
assert_eq!(hi, i8x16::new(17, 18, 19, 20, 21, 22, 23, 24,
25, 26, 27, 28, 29, 30, 31, 32));
assert_eq!(lo, i8x16::new(1, 2, 3, 4, 5, 6, 7, 8,
9, 10, 11, 12, 13, 14, 15, 16));
#[cfg_attr(rustfmt, rustfmt_skip)]
let e = i8x16::new(
17, 18, 19, 20, 21, 22, 23, 24,
25, 26, 27, 28, 29, 30, 31, 32
);
assert_eq!(hi, e);
assert_eq!(
lo,
i8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16)
);
}
}

View file

@ -96,8 +96,8 @@ pub unsafe fn _mm256_adds_epu16(a: u16x16, b: u16x16) -> u16x16 {
paddusw(a, b)
}
/// Concatenate pairs of 16-byte blocks in `a` and `b` into a 32-byte temporary result,
/// shift the result right by `n` bytes, and return the low 16 bytes.
/// Concatenate pairs of 16-byte blocks in `a` and `b` into a 32-byte temporary
/// result, shift the result right by `n` bytes, and return the low 16 bytes.
#[inline(always)]
#[target_feature = "+avx2"]
#[cfg_attr(test, assert_instr(vpalignr, n = 15))]
@ -116,7 +116,9 @@ pub unsafe fn _mm256_alignr_epi8(a: i8x32, b: i8x32, n: i32) -> i8x32 {
(a, b, n)
};
const fn add(a: u32, b: u32) -> u32 { a + b }
const fn add(a: u32, b: u32) -> u32 {
a + b
}
macro_rules! shuffle {
($shift:expr) => {
simd_shuffle32(b, a, [
@ -140,14 +142,22 @@ pub unsafe fn _mm256_alignr_epi8(a: i8x32, b: i8x32, n: i32) -> i8x32 {
}
}
match n {
0 => shuffle!(0), 1 => shuffle!(1),
2 => shuffle!(2), 3 => shuffle!(3),
4 => shuffle!(4), 5 => shuffle!(5),
6 => shuffle!(6), 7 => shuffle!(7),
8 => shuffle!(8), 9 => shuffle!(9),
10 => shuffle!(10), 11 => shuffle!(11),
12 => shuffle!(12), 13 => shuffle!(13),
14 => shuffle!(14), 15 => shuffle!(15),
0 => shuffle!(0),
1 => shuffle!(1),
2 => shuffle!(2),
3 => shuffle!(3),
4 => shuffle!(4),
5 => shuffle!(5),
6 => shuffle!(6),
7 => shuffle!(7),
8 => shuffle!(8),
9 => shuffle!(9),
10 => shuffle!(10),
11 => shuffle!(11),
12 => shuffle!(12),
13 => shuffle!(13),
14 => shuffle!(14),
15 => shuffle!(15),
_ => shuffle!(16),
}
}
@ -174,7 +184,7 @@ pub unsafe fn _mm256_andnot_si256(a: __m256i, b: __m256i) -> __m256i {
#[inline(always)]
#[target_feature = "+avx2"]
#[cfg_attr(test, assert_instr(vpavgw))]
pub unsafe fn _mm256_avg_epu16 (a: u16x16, b: u16x16) -> u16x16 {
pub unsafe fn _mm256_avg_epu16(a: u16x16, b: u16x16) -> u16x16 {
pavgw(a, b)
}
@ -182,7 +192,7 @@ pub unsafe fn _mm256_avg_epu16 (a: u16x16, b: u16x16) -> u16x16 {
#[inline(always)]
#[target_feature = "+avx2"]
#[cfg_attr(test, assert_instr(vpavgb))]
pub unsafe fn _mm256_avg_epu8 (a: u8x32, b: u8x32) -> u8x32 {
pub unsafe fn _mm256_avg_epu8(a: u8x32, b: u8x32) -> u8x32 {
pavgb(a, b)
}
@ -320,8 +330,8 @@ pub unsafe fn _mm256_blend_epi16(a: i16x16, b: i16x16, imm8: i32) -> i16x16 {
#[inline(always)]
#[target_feature = "+avx2"]
#[cfg_attr(test, assert_instr(vpblendvb))]
pub unsafe fn _mm256_blendv_epi8(a:i8x32,b:i8x32,mask:__m256i) -> i8x32 {
pblendvb(a,b,mask)
pub unsafe fn _mm256_blendv_epi8(a: i8x32, b: i8x32, mask: __m256i) -> i8x32 {
pblendvb(a, b, mask)
}
/// Broadcast the low packed 8-bit integer from `a` to all elements of
@ -628,37 +638,83 @@ pub unsafe fn _mm256_hsubs_epi16(a: i16x16, b: i16x16) -> i16x16 {
}
// TODO _mm_i32gather_epi32 (int const* base_addr, __m128i vindex, const int scale)
// TODO _mm_mask_i32gather_epi32 (__m128i src, int const* base_addr, __m128i vindex, __m128i mask, const int scale)
// TODO _mm256_i32gather_epi32 (int const* base_addr, __m256i vindex, const int scale)
// TODO _mm256_mask_i32gather_epi32 (__m256i src, int const* base_addr, __m256i vindex, __m256i mask, const int scale)
// TODO _mm_i32gather_epi64 (__int64 const* base_addr, __m128i vindex, const int scale)
// TODO _mm_mask_i32gather_epi64 (__m128i src, __int64 const* base_addr, __m128i vindex, __m128i mask, const int scale)
// TODO _mm256_i32gather_epi64 (__int64 const* base_addr, __m128i vindex, const int scale)
// TODO _mm256_mask_i32gather_epi64 (__m256i src, __int64 const* base_addr, __m128i vindex, __m256i mask, const int scale)
// TODO _mm_i32gather_pd (double const* base_addr, __m128i vindex, const int scale)
// TODO _mm_mask_i32gather_pd (__m128d src, double const* base_addr, __m128i vindex, __m128d mask, const int scale)
// TODO _mm256_i32gather_pd (double const* base_addr, __m128i vindex, const int scale)
// TODO _mm256_mask_i32gather_pd (__m256d src, double const* base_addr, __m128i vindex, __m256d mask, const int scale)
// TODO _mm_i32gather_ps (float const* base_addr, __m128i vindex, const int scale)
// TODO _mm_mask_i32gather_ps (__m128 src, float const* base_addr, __m128i vindex, __m128 mask, const int scale)
// TODO _mm256_i32gather_ps (float const* base_addr, __m256i vindex, const int scale)
// TODO _mm256_mask_i32gather_ps (__m256 src, float const* base_addr, __m256i vindex, __m256 mask, const int scale)
// TODO _mm_i64gather_epi32 (int const* base_addr, __m128i vindex, const int scale)
// TODO _mm_mask_i64gather_epi32 (__m128i src, int const* base_addr, __m128i vindex, __m128i mask, const int scale)
// TODO _mm256_i64gather_epi32 (int const* base_addr, __m256i vindex, const int scale)
// TODO _mm256_mask_i64gather_epi32 (__m128i src, int const* base_addr, __m256i vindex, __m128i mask, const int scale)
// TODO _mm_i64gather_epi64 (__int64 const* base_addr, __m128i vindex, const int scale)
// TODO _mm_mask_i64gather_epi64 (__m128i src, __int64 const* base_addr, __m128i vindex, __m128i mask, const int scale)
// TODO _mm256_i64gather_epi64 (__int64 const* base_addr, __m256i vindex, const int scale)
// TODO _mm256_mask_i64gather_epi64 (__m256i src, __int64 const* base_addr, __m256i vindex, __m256i mask, const int scale)
// TODO _mm_i64gather_pd (double const* base_addr, __m128i vindex, const int scale)
// TODO _mm_mask_i64gather_pd (__m128d src, double const* base_addr, __m128i vindex, __m128d mask, const int scale)
// TODO _mm256_i64gather_pd (double const* base_addr, __m256i vindex, const int scale)
// TODO _mm256_mask_i64gather_pd (__m256d src, double const* base_addr, __m256i vindex, __m256d mask, const int scale)
// TODO _mm_i64gather_ps (float const* base_addr, __m128i vindex, const int scale)
// TODO _mm_mask_i64gather_ps (__m128 src, float const* base_addr, __m128i vindex, __m128 mask, const int scale)
// TODO _mm256_i64gather_ps (float const* base_addr, __m256i vindex, const int scale)
// TODO _mm_i32gather_epi32 (int const* base_addr, __m128i vindex,
// const int scale)
// TODO _mm_mask_i32gather_epi32 (__m128i src, int const* base_addr,
// __m128i vindex, __m128i mask,
// const int scale)
// TODO _mm256_i32gather_epi32 (int const* base_addr, __m256i vindex,
// const int scale)
// TODO _mm256_mask_i32gather_epi32 (__m256i src, int const* base_addr,
// __m256i vindex, __m256i mask,
// const int scale)
// TODO _mm_i32gather_epi64 (__int64 const* base_addr, __m128i vindex,
// const int scale)
// TODO _mm_mask_i32gather_epi64 (__m128i src, __int64 const* base_addr,
// __m128i vindex, __m128i mask,
// const int scale)
// TODO _mm256_i32gather_epi64 (__int64 const* base_addr, __m128i vindex,
// const int scale)
// TODO _mm256_mask_i32gather_epi64 (__m256i src, __int64 const* base_addr,
// __m128i vindex, __m256i mask,
// const int scale)
// TODO _mm_i32gather_pd (double const* base_addr, __m128i vindex,
// const int scale)
// TODO _mm_mask_i32gather_pd (__m128d src, double const* base_addr,
// __m128i vindex, __m128d mask,
// const int scale)
// TODO _mm256_i32gather_pd (double const* base_addr, __m128i vindex,
// const int scale)
// TODO _mm256_mask_i32gather_pd (__m256d src, double const* base_addr,
// __m128i vindex, __m256d mask,
// const int scale)
// TODO _mm_i32gather_ps (float const* base_addr, __m128i vindex,
// const int scale)
// TODO _mm_mask_i32gather_ps (__m128 src, float const* base_addr,
// __m128i vindex, __m128 mask,
// const int scale)
// TODO _mm256_i32gather_ps (float const* base_addr, __m256i vindex,
// const int scale)
// TODO _mm256_mask_i32gather_ps (__m256 src, float const* base_addr,
// __m256i vindex, __m256 mask,
// const int scale)
// TODO _mm_i64gather_epi32 (int const* base_addr, __m128i vindex,
// const int scale)
// TODO _mm_mask_i64gather_epi32 (__m128i src, int const* base_addr,
// __m128i vindex, __m128i mask,
// const int scale)
// TODO _mm256_i64gather_epi32 (int const* base_addr, __m256i vindex,
// const int scale)
// TODO _mm256_mask_i64gather_epi32 (__m128i src, int const* base_addr,
// __m256i vindex, __m128i mask,
// const int scale)
// TODO _mm_i64gather_epi64 (__int64 const* base_addr, __m128i vindex,
// const int scale)
// TODO _mm_mask_i64gather_epi64 (__m128i src, __int64 const* base_addr,
// __m128i vindex, __m128i mask,
// const int scale)
// TODO _mm256_i64gather_epi64 (__int64 const* base_addr, __m256i vindex,
// const int scale)
// TODO _mm256_mask_i64gather_epi64 (__m256i src, __int64 const* base_addr,
// __m256i vindex, __m256i mask,
// const int scale)
// TODO _mm_i64gather_pd (double const* base_addr, __m128i vindex,
// const int scale)
// TODO _mm_mask_i64gather_pd (__m128d src, double const* base_addr,
// __m128i vindex, __m128d mask,
// const int scale)
// TODO _mm256_i64gather_pd (double const* base_addr, __m256i vindex,
// const int scale)
// TODO _mm256_mask_i64gather_pd (__m256d src, double const* base_addr,
// __m256i vindex, __m256d mask,
// const int scale)
// TODO _mm_i64gather_ps (float const* base_addr, __m128i vindex,
// const int scale)
// TODO _mm_mask_i64gather_ps (__m128 src, float const* base_addr,
// __m128i vindex, __m128 mask,
// const int scale)
// TODO _mm256_i64gather_ps (float const* base_addr, __m256i vindex,
// const int scale)
// TODO _mm256_mask_i64gather_ps
// TODO _mm256_inserti128_si256
@ -946,7 +1002,7 @@ pub unsafe fn _mm256_mulhi_epu16(a: u16x16, b: u16x16) -> u16x16 {
#[inline(always)]
#[target_feature = "+avx2"]
#[cfg_attr(test, assert_instr(vpmullw))]
pub unsafe fn _mm256_mullo_epi16(a: i16x16, b:i16x16) -> i16x16 {
pub unsafe fn _mm256_mullo_epi16(a: i16x16, b: i16x16) -> i16x16 {
a * b
}
@ -957,7 +1013,7 @@ pub unsafe fn _mm256_mullo_epi16(a: i16x16, b:i16x16) -> i16x16 {
#[inline(always)]
#[target_feature = "+avx2"]
#[cfg_attr(test, assert_instr(vpmulld))]
pub unsafe fn _mm256_mullo_epi32(a: i32x8, b:i32x8) -> i32x8 {
pub unsafe fn _mm256_mullo_epi32(a: i32x8, b: i32x8) -> i32x8 {
a * b
}
@ -968,7 +1024,7 @@ pub unsafe fn _mm256_mullo_epi32(a: i32x8, b:i32x8) -> i32x8 {
#[inline(always)]
#[target_feature = "+avx2"]
#[cfg_attr(test, assert_instr(vpmulhrsw))]
pub unsafe fn _mm256_mulhrs_epi16(a: i16x16, b:i16x16) -> i16x16 {
pub unsafe fn _mm256_mulhrs_epi16(a: i16x16, b: i16x16) -> i16x16 {
pmulhrsw(a, b)
}
@ -1088,7 +1144,7 @@ pub unsafe fn _mm256_permute4x64_epi64(a: i64x4, imm8: i32) -> i64x4 {
#[inline(always)]
#[target_feature = "+avx2"]
#[cfg_attr(test, assert_instr(vpsadbw))]
pub unsafe fn _mm256_sad_epu8 (a: u8x32, b: u8x32) -> u64x4 {
pub unsafe fn _mm256_sad_epu8(a: u8x32, b: u8x32) -> u64x4 {
psadbw(a, b)
}
@ -1580,15 +1636,19 @@ pub unsafe fn _mm256_subs_epu8(a: u8x32, b: u8x32) -> u8x32 {
/// use stdsimd::simd::i8x32;
/// use stdsimd::vendor::_mm256_unpackhi_epi8;
///
/// let a = i8x32::new(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31);
/// let b = i8x32::new(0,-1,-2,-3,-4,-5,-6,-7,-8,-9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31);
/// let a = i8x32::new(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
/// 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31);
/// let b = i8x32::new(0,-1,-2,-3,-4,-5,-6,-7,-8,-9,-10,-11,-12,-13,-14,-15,
/// -16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31);
///
/// let c: i8x32;
/// unsafe {
/// c = _mm256_unpackhi_epi8(a, b);
/// }
///
/// let expected = i8x32::new(8,-8, 9,-9, 10,-10, 11,-11, 12,-12, 13,-13, 14,-14, 15,-15, 24,-24, 25,-25, 26,-26, 27,-27, 28,-28, 29,-29, 30,-30, 31,-31);
/// let expected = i8x32::new(8,-8, 9,-9, 10,-10, 11,-11, 12,-12, 13,-13,
/// 14,-14, 15,-15, 24,-24, 25,-25, 26,-26, 27,-27, 28,-28, 29,-29, 30,-30,
/// 31,-31);
/// assert_eq!(c, expected);
///
/// # }
@ -1600,7 +1660,13 @@ pub unsafe fn _mm256_subs_epu8(a: u8x32, b: u8x32) -> u8x32 {
#[target_feature = "+avx2"]
#[cfg_attr(test, assert_instr(vpunpckhbw))]
pub unsafe fn _mm256_unpackhi_epi8(a: i8x32, b: i8x32) -> i8x32 {
simd_shuffle32(a, b, [8, 40, 9, 41, 10, 42, 11, 43, 12, 44, 13, 45, 14, 46, 15, 47, 24, 56, 25, 57, 26, 58, 27, 59, 28, 60, 29, 61, 30, 62, 31, 63])
#[cfg_attr(rustfmt, rustfmt_skip)]
simd_shuffle32(a, b, [
8, 40, 9, 41, 10, 42, 11, 43,
12, 44, 13, 45, 14, 46, 15, 47,
24, 56, 25, 57, 26, 58, 27, 59,
28, 60, 29, 61, 30, 62, 31, 63,
])
}
/// Unpack and interleave 8-bit integers from the low half of each
@ -1619,15 +1685,18 @@ pub unsafe fn _mm256_unpackhi_epi8(a: i8x32, b: i8x32) -> i8x32 {
/// use stdsimd::simd::i8x32;
/// use stdsimd::vendor::_mm256_unpacklo_epi8;
///
/// let a = i8x32::new(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31);
/// let b = i8x32::new(0,-1,-2,-3,-4,-5,-6,-7,-8,-9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31);
/// let a = i8x32::new(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
/// 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31);
/// let b = i8x32::new(0,-1,-2,-3,-4,-5,-6,-7,-8,-9,-10,-11,-12,-13,-14,-15,
/// -16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31);
///
/// let c: i8x32;
/// unsafe {
/// c = _mm256_unpacklo_epi8(a, b);
/// }
///
/// let expected = i8x32::new(0, 0, 1,-1, 2,-2, 3,-3, 4,-4, 5,-5, 6,-6, 7,-7, 16,-16, 17,-17, 18,-18, 19,-19, 20,-20, 21,-21, 22,-22, 23,-23);
/// let expected = i8x32::new(0, 0, 1,-1, 2,-2, 3,-3, 4,-4, 5,-5, 6,-6, 7,-7,
/// 16,-16, 17,-17, 18,-18, 19,-19, 20,-20, 21,-21, 22,-22, 23,-23);
/// assert_eq!(c, expected);
///
/// # }
@ -1639,7 +1708,13 @@ pub unsafe fn _mm256_unpackhi_epi8(a: i8x32, b: i8x32) -> i8x32 {
#[target_feature = "+avx2"]
#[cfg_attr(test, assert_instr(vpunpcklbw))]
pub unsafe fn _mm256_unpacklo_epi8(a: i8x32, b: i8x32) -> i8x32 {
simd_shuffle32(a, b, [0, 32, 1, 33, 2, 34, 3, 35, 4, 36, 5, 37, 6, 38, 7, 39, 16, 48, 17, 49, 18, 50, 19, 51, 20, 52, 21, 53, 22, 54, 23, 55])
#[cfg_attr(rustfmt, rustfmt_skip)]
simd_shuffle32(a, b, [
0, 32, 1, 33, 2, 34, 3, 35,
4, 36, 5, 37, 6, 38, 7, 39,
16, 48, 17, 49, 18, 50, 19, 51,
20, 52, 21, 53, 22, 54, 23, 55,
])
}
/// Unpack and interleave 16-bit integers from the high half of each
@ -1666,7 +1741,8 @@ pub unsafe fn _mm256_unpacklo_epi8(a: i8x32, b: i8x32) -> i8x32 {
/// c = _mm256_unpackhi_epi16(a, b);
/// }
///
/// let expected = i16x16::new(4,-4, 5,-5, 6,-6, 7,-7, 12,-12, 13,-13, 14,-14, 15,-15);
/// let expected = i16x16::new(4,-4, 5,-5, 6,-6, 7,-7, 12,-12, 13,-13, 14,-14,
/// 15,-15);
/// assert_eq!(c, expected);
///
/// # }
@ -1678,7 +1754,11 @@ pub unsafe fn _mm256_unpacklo_epi8(a: i8x32, b: i8x32) -> i8x32 {
#[target_feature = "+avx2"]
#[cfg_attr(test, assert_instr(vpunpckhwd))]
pub unsafe fn _mm256_unpackhi_epi16(a: i16x16, b: i16x16) -> i16x16 {
simd_shuffle16(a, b, [4, 20, 5, 21, 6, 22, 7, 23, 12, 28, 13, 29, 14, 30, 15, 31])
simd_shuffle16(
a,
b,
[4, 20, 5, 21, 6, 22, 7, 23, 12, 28, 13, 29, 14, 30, 15, 31],
)
}
/// Unpack and interleave 16-bit integers from the low half of each
@ -1705,7 +1785,8 @@ pub unsafe fn _mm256_unpackhi_epi16(a: i16x16, b: i16x16) -> i16x16 {
/// c = _mm256_unpacklo_epi16(a, b);
/// }
///
/// let expected = i16x16::new(0, 0, 1,-1, 2,-2, 3,-3, 8,-8, 9,-9, 10,-10, 11,-11);
/// let expected = i16x16::new(0, 0, 1,-1, 2,-2, 3,-3, 8,-8, 9,-9, 10,-10,
/// 11,-11);
/// assert_eq!(c, expected);
///
/// # }
@ -1717,7 +1798,11 @@ pub unsafe fn _mm256_unpackhi_epi16(a: i16x16, b: i16x16) -> i16x16 {
#[target_feature = "+avx2"]
#[cfg_attr(test, assert_instr(vpunpcklwd))]
pub unsafe fn _mm256_unpacklo_epi16(a: i16x16, b: i16x16) -> i16x16 {
simd_shuffle16(a, b, [0, 16, 1, 17, 2, 18, 3, 19, 8, 24, 9, 25, 10, 26, 11, 27])
simd_shuffle16(
a,
b,
[0, 16, 1, 17, 2, 18, 3, 19, 8, 24, 9, 25, 10, 26, 11, 27],
)
}
/// Unpack and interleave 32-bit integers from the high half of each
@ -1972,9 +2057,9 @@ extern "C" {
#[link_name = "llvm.x86.avx2.pmulh.w"]
fn pmulhw(a: i16x16, b: i16x16) -> i16x16;
#[link_name = "llvm.x86.avx2.pmul.dq"]
fn pmuldq(a: i32x8, b:i32x8) -> i64x4;
fn pmuldq(a: i32x8, b: i32x8) -> i64x4;
#[link_name = "llvm.x86.avx2.pmulu.dq"]
fn pmuludq(a: u32x8, b:u32x8) -> u64x4;
fn pmuludq(a: u32x8, b: u32x8) -> u64x4;
#[link_name = "llvm.x86.avx2.pmul.hr.sw"]
fn pmulhrsw(a: i16x16, b: i16x16) -> i16x16;
#[link_name = "llvm.x86.avx2.packsswb"]
@ -2006,17 +2091,17 @@ extern "C" {
#[link_name = "llvm.x86.avx2.pslli.q"]
fn pslliq(a: i64x4, imm8: i32) -> i64x4;
#[link_name = "llvm.x86.avx2.psllv.d"]
fn psllvd(a:i32x4, count:i32x4) -> i32x4;
fn psllvd(a: i32x4, count: i32x4) -> i32x4;
#[link_name = "llvm.x86.avx2.psllv.d.256"]
fn psllvd256(a:i32x8, count:i32x8) -> i32x8;
fn psllvd256(a: i32x8, count: i32x8) -> i32x8;
#[link_name = "llvm.x86.avx2.psllv.q"]
fn psllvq(a:i64x2, count:i64x2) -> i64x2;
fn psllvq(a: i64x2, count: i64x2) -> i64x2;
#[link_name = "llvm.x86.avx2.psllv.q.256"]
fn psllvq256(a:i64x4, count:i64x4) -> i64x4;
fn psllvq256(a: i64x4, count: i64x4) -> i64x4;
#[link_name = "llvm.x86.avx2.psra.w"]
fn psraw(a: i16x16, count:i16x8) -> i16x16;
fn psraw(a: i16x16, count: i16x8) -> i16x16;
#[link_name = "llvm.x86.avx2.psra.d"]
fn psrad(a: i32x8, count:i32x4) -> i32x8;
fn psrad(a: i32x8, count: i32x4) -> i32x8;
#[link_name = "llvm.x86.avx2.psrai.w"]
fn psraiw(a: i16x16, imm8: i32) -> i16x16;
#[link_name = "llvm.x86.avx2.psrai.d"]
@ -2026,11 +2111,11 @@ extern "C" {
#[link_name = "llvm.x86.avx2.psrav.d.256"]
fn psravd256(a: i32x8, count: i32x8) -> i32x8;
#[link_name = "llvm.x86.avx2.psrl.w"]
fn psrlw(a: i16x16, count:i16x8) -> i16x16;
fn psrlw(a: i16x16, count: i16x8) -> i16x16;
#[link_name = "llvm.x86.avx2.psrl.d"]
fn psrld(a: i32x8, count:i32x4) -> i32x8;
fn psrld(a: i32x8, count: i32x4) -> i32x8;
#[link_name = "llvm.x86.avx2.psrl.q"]
fn psrlq(a: i64x4, count:i64x2) -> i64x4;
fn psrlq(a: i64x4, count: i64x2) -> i64x4;
#[link_name = "llvm.x86.avx2.psrli.w"]
fn psrliw(a: i16x16, imm8: i32) -> i16x16;
#[link_name = "llvm.x86.avx2.psrli.d"]
@ -2071,49 +2156,53 @@ mod tests {
#[simd_test = "avx2"]
unsafe fn _mm256_abs_epi32() {
#[cfg_attr(rustfmt, rustfmt_skip)]
let a = i32x8::new(
0, 1, -1, std::i32::MAX,
std::i32::MIN + 1, 100, -100, -32);
std::i32::MIN + 1, 100, -100, -32,
);
let r = avx2::_mm256_abs_epi32(a);
#[cfg_attr(rustfmt, rustfmt_skip)]
let e = i32x8::new(
0, 1, 1, std::i32::MAX,
(std::i32::MIN + 1).abs(), 100, 100, 32);
(std::i32::MIN + 1).abs(), 100, 100, 32,
);
assert_eq!(r, e);
}
#[simd_test = "avx2"]
unsafe fn _mm256_abs_epi16() {
#[cfg_attr(rustfmt, rustfmt_skip)]
let a = i16x16::new(
0, 1, -1, 2,
-2, 3, -3, 4,
-4, 5, -5, std::i16::MAX,
std::i16::MIN + 1, 100, -100, -32);
0, 1, -1, 2, -2, 3, -3, 4,
-4, 5, -5, std::i16::MAX, std::i16::MIN + 1, 100, -100, -32,
);
let r = avx2::_mm256_abs_epi16(a);
#[cfg_attr(rustfmt, rustfmt_skip)]
let e = i16x16::new(
0, 1, 1, 2,
2, 3, 3, 4,
4, 5, 5, std::i16::MAX,
(std::i16::MIN + 1).abs(), 100, 100, 32);
0, 1, 1, 2, 2, 3, 3, 4,
4, 5, 5, std::i16::MAX, (std::i16::MIN + 1).abs(), 100, 100, 32,
);
assert_eq!(r, e);
}
#[simd_test = "avx2"]
unsafe fn _mm256_abs_epi8() {
#[cfg_attr(rustfmt, rustfmt_skip)]
let a = i8x32::new(
0, 1, -1, 2,
-2, 3, -3, 4,
-4, 5, -5, std::i8::MAX,
std::i8::MIN + 1, 100, -100, -32,
0, 1, -1, 2,
-2, 3, -3, 4,
-4, 5, -5, std::i8::MAX,
std::i8::MIN + 1, 100, -100, -32);
0, 1, -1, 2, -2, 3, -3, 4,
-4, 5, -5, std::i8::MAX, std::i8::MIN + 1, 100, -100, -32,
0, 1, -1, 2, -2, 3, -3, 4,
-4, 5, -5, std::i8::MAX, std::i8::MIN + 1, 100, -100, -32,
);
let r = avx2::_mm256_abs_epi8(a);
#[cfg_attr(rustfmt, rustfmt_skip)]
let e = i8x32::new(
0, 1, 1, 2, 2, 3, 3, 4,
4, 5, 5, std::i8::MAX, (std::i8::MIN + 1).abs(), 100, 100, 32,
0, 1, 1, 2, 2, 3, 3, 4,
4, 5, 5, std::i8::MAX, (std::i8::MIN + 1).abs(), 100, 100, 32);
4, 5, 5, std::i8::MAX, (std::i8::MIN + 1).abs(), 100, 100, 32,
);
assert_eq!(r, e);
}
@ -2137,52 +2226,70 @@ mod tests {
#[simd_test = "avx2"]
unsafe fn _mm256_add_epi16() {
let a = i16x16::new(
0, 1, 2, 3, 4, 5, 6, 7,
8, 9, 10, 11, 12, 13, 14, 15);
let b = i16x16::new(
0, 1, 2, 3, 4, 5, 6, 7,
8, 9, 10, 11, 12, 13, 14, 15);
let a =
i16x16::new(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
let b =
i16x16::new(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
let r = avx2::_mm256_add_epi16(a, b);
#[cfg_attr(rustfmt, rustfmt_skip)]
let e = i16x16::new(
0, 2, 4, 6, 8, 10, 12, 14,
16, 18, 20, 22, 24, 26, 28, 30);
16, 18, 20, 22, 24, 26, 28, 30,
);
assert_eq!(r, e);
}
#[simd_test = "avx2"]
unsafe fn _mm256_add_epi8() {
#[cfg_attr(rustfmt, rustfmt_skip)]
let a = i8x32::new(
0, 1, 2, 3, 4, 5, 6, 7,
8, 9, 10, 11, 12, 13, 14, 15,
16, 17, 18, 19, 20, 21, 22, 23,
24, 25, 26, 27, 28, 29, 30, 31);
24, 25, 26, 27, 28, 29, 30, 31,
);
#[cfg_attr(rustfmt, rustfmt_skip)]
let b = i8x32::new(
0, 1, 2, 3, 4, 5, 6, 7,
8, 9, 10, 11, 12, 13, 14, 15,
16, 17, 18, 19, 20, 21, 22, 23,
24, 25, 26, 27, 28, 29, 30, 31);
24, 25, 26, 27, 28, 29, 30, 31,
);
let r = avx2::_mm256_add_epi8(a, b);
#[cfg_attr(rustfmt, rustfmt_skip)]
let e = i8x32::new(
0, 2, 4, 6, 8, 10, 12, 14, 16,
18, 20, 22, 24, 26, 28, 30, 32,
34, 36, 38, 40, 42, 44, 46, 48,
50, 52, 54, 56, 58, 60, 62);
0, 2, 4, 6, 8, 10, 12, 14,
16, 18, 20, 22, 24, 26, 28, 30,
32, 34, 36, 38, 40, 42, 44, 46,
48, 50, 52, 54, 56, 58, 60, 62,
);
assert_eq!(r, e);
}
#[simd_test = "avx2"]
unsafe fn _mm256_adds_epi8() {
#[cfg_attr(rustfmt, rustfmt_skip)]
let a = i8x32::new(
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31);
0, 1, 2, 3, 4, 5, 6, 7,
8, 9, 10, 11, 12, 13, 14, 15,
16, 17, 18, 19, 20, 21, 22, 23,
24, 25, 26, 27, 28, 29, 30, 31,
);
#[cfg_attr(rustfmt, rustfmt_skip)]
let b = i8x32::new(
32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63);
32, 33, 34, 35, 36, 37, 38, 39,
40, 41, 42, 43, 44, 45, 46, 47,
48, 49, 50, 51, 52, 53, 54, 55,
56, 57, 58, 59, 60, 61, 62, 63,
);
let r = avx2::_mm256_adds_epi8(a, b);
#[cfg_attr(rustfmt, rustfmt_skip)]
let e = i8x32::new(
32, 34, 36, 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62,
64, 66, 68, 70, 72, 74, 76, 78, 80, 82, 84, 86, 88, 90, 92, 94);
32, 34, 36, 38, 40, 42, 44, 46,
48, 50, 52, 54, 56, 58, 60, 62,
64, 66, 68, 70, 72, 74, 76, 78,
80, 82, 84, 86, 88, 90, 92, 94,
);
assert_eq!(r, e);
}
@ -2204,13 +2311,19 @@ mod tests {
#[simd_test = "avx2"]
unsafe fn _mm256_adds_epi16() {
let a = i16x16::new(
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
let a =
i16x16::new(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
#[cfg_attr(rustfmt, rustfmt_skip)]
let b = i16x16::new(
32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47);
let r = avx2::_mm256_adds_epi16(a, b);
32, 33, 34, 35, 36, 37, 38, 39,
40, 41, 42, 43, 44, 45, 46, 47,
);
let r = avx2::_mm256_adds_epi16(a, b);
#[cfg_attr(rustfmt, rustfmt_skip)]
let e = i16x16::new(
32, 34, 36, 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62);
32, 34, 36, 38, 40, 42, 44, 46,
48, 50, 52, 54, 56, 58, 60, 62,
);
assert_eq!(r, e);
}
@ -2233,16 +2346,28 @@ mod tests {
#[simd_test = "avx2"]
unsafe fn _mm256_adds_epu8() {
#[cfg_attr(rustfmt, rustfmt_skip)]
let a = u8x32::new(
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31);
0, 1, 2, 3, 4, 5, 6, 7,
8, 9, 10, 11, 12, 13, 14, 15,
16, 17, 18, 19, 20, 21, 22, 23,
24, 25, 26, 27, 28, 29, 30, 31,
);
#[cfg_attr(rustfmt, rustfmt_skip)]
let b = u8x32::new(
32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63);
32, 33, 34, 35, 36, 37, 38, 39,
40, 41, 42, 43, 44, 45, 46, 47,
48, 49, 50, 51, 52, 53, 54, 55,
56, 57, 58, 59, 60, 61, 62, 63,
);
let r = avx2::_mm256_adds_epu8(a, b);
#[cfg_attr(rustfmt, rustfmt_skip)]
let e = u8x32::new(
32, 34, 36, 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62,
64, 66, 68, 70, 72, 74, 76, 78, 80, 82, 84, 86, 88, 90, 92, 94);
32, 34, 36, 38, 40, 42, 44, 46,
48, 50, 52, 54, 56, 58, 60, 62,
64, 66, 68, 70, 72, 74, 76, 78,
80, 82, 84, 86, 88, 90, 92, 94,
);
assert_eq!(r, e);
}
@ -2257,13 +2382,19 @@ mod tests {
#[simd_test = "avx2"]
unsafe fn _mm256_adds_epu16() {
let a = u16x16::new(
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
let a =
u16x16::new(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
#[cfg_attr(rustfmt, rustfmt_skip)]
let b = u16x16::new(
32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47);
32, 33, 34, 35, 36, 37, 38, 39,
40, 41, 42, 43, 44, 45, 46, 47,
);
let r = avx2::_mm256_adds_epu16(a, b);
#[cfg_attr(rustfmt, rustfmt_skip)]
let e = u16x16::new(
32, 34, 36, 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62);
32, 34, 36, 38, 40, 42, 44, 46,
48, 50, 52, 54, 56, 58, 60, 62,
);
assert_eq!(r, e);
}
@ -2346,11 +2477,11 @@ mod tests {
#[simd_test = "avx2"]
unsafe fn _mm256_blendv_epi8() {
let (a,b) = (i8x32::splat(4),i8x32::splat(2));
let mask = i8x32::splat(0).replace(2,-1);
let e = i8x32::splat(4).replace(2,2);
let r= avx2::_mm256_blendv_epi8(a,b,mask);
assert_eq!(r,e);
let (a, b) = (i8x32::splat(4), i8x32::splat(2));
let mask = i8x32::splat(0).replace(2, -1);
let e = i8x32::splat(4).replace(2, 2);
let r = avx2::_mm256_blendv_epi8(a, b, mask);
assert_eq!(r, e);
}
#[simd_test = "avx2"]
@ -2413,8 +2544,12 @@ mod tests {
unsafe fn _mm256_broadcastsi128_si256() {
let a = i64x2::new(0x0987654321012334, 0x5678909876543210);
let res = avx2::_mm256_broadcastsi128_si256(a);
let retval = i64x4::new(0x0987654321012334, 0x5678909876543210,
0x0987654321012334, 0x5678909876543210);
let retval = i64x4::new(
0x0987654321012334,
0x5678909876543210,
0x0987654321012334,
0x5678909876543210,
);
assert_eq!(res, retval);
}
@ -2448,30 +2583,38 @@ mod tests {
#[simd_test = "avx2"]
unsafe fn _mm256_cmpeq_epi8() {
#[cfg_attr(rustfmt, rustfmt_skip)]
let a = i8x32::new(
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31);
0, 1, 2, 3, 4, 5, 6, 7,
8, 9, 10, 11, 12, 13, 14, 15,
16, 17, 18, 19, 20, 21, 22, 23,
24, 25, 26, 27, 28, 29, 30, 31,
);
#[cfg_attr(rustfmt, rustfmt_skip)]
let b = i8x32::new(
31, 30, 2, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16,
15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
31, 30, 2, 28, 27, 26, 25, 24,
23, 22, 21, 20, 19, 18, 17, 16,
15, 14, 13, 12, 11, 10, 9, 8,
7, 6, 5, 4, 3, 2, 1, 0,
);
let r = avx2::_mm256_cmpeq_epi8(a, b);
assert_eq!(r, i8x32::splat(0).replace(2,0xFFu8 as i8));
assert_eq!(r, i8x32::splat(0).replace(2, 0xFFu8 as i8));
}
#[simd_test = "avx2"]
unsafe fn _mm256_cmpeq_epi16() {
let a = i16x16::new(
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
let b = i16x16::new(
15, 14, 2, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
let a =
i16x16::new(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
let b =
i16x16::new(15, 14, 2, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
let r = avx2::_mm256_cmpeq_epi16(a, b);
assert_eq!(r, i16x16::splat(0).replace(2, 0xFFFFu16 as i16));
}
#[simd_test = "avx2"]
unsafe fn _mm256_cmpeq_epi32() {
let a = i32x8::new(0, 1, 2, 3,4,5,6,7);
let b = i32x8::new(7,6,2,4,3, 2, 1, 0);
let a = i32x8::new(0, 1, 2, 3, 4, 5, 6, 7);
let b = i32x8::new(7, 6, 2, 4, 3, 2, 1, 0);
let r = avx2::_mm256_cmpeq_epi32(a, b);
assert_eq!(r, i32x8::splat(0).replace(2, 0xFFFFFFFFu32 as i32));
}
@ -2481,8 +2624,10 @@ mod tests {
let a = i64x4::new(0, 1, 2, 3);
let b = i64x4::new(3, 2, 2, 0);
let r = avx2::_mm256_cmpeq_epi64(a, b);
assert_eq!(r, i64x4::splat(0).replace(
2, 0xFFFFFFFFFFFFFFFFu64 as i64));
assert_eq!(
r,
i64x4::splat(0).replace(2, 0xFFFFFFFFFFFFFFFFu64 as i64)
);
}
#[simd_test = "avx2"]
@ -2514,27 +2659,33 @@ mod tests {
let a = i64x4::splat(0).replace(0, 5);
let b = i64x4::splat(0);
let r = avx2::_mm256_cmpgt_epi64(a, b);
assert_eq!(r, i64x4::splat(0).replace(
0, 0xFFFFFFFFFFFFFFFFu64 as i64));
assert_eq!(
r,
i64x4::splat(0).replace(0, 0xFFFFFFFFFFFFFFFFu64 as i64)
);
}
#[simd_test = "avx2"]
unsafe fn _mm256_cvtepi8_epi16() {
let a = i8x16::new(0, 0, -1, 1, -2, 2, -3, 3, -4, 4, -5, 5, -6, 6, -7, 7);
let r = i16x16::new(0, 0, -1, 1, -2, 2, -3, 3, -4, 4, -5, 5, -6, 6, -7, 7);
let a =
i8x16::new(0, 0, -1, 1, -2, 2, -3, 3, -4, 4, -5, 5, -6, 6, -7, 7);
let r =
i16x16::new(0, 0, -1, 1, -2, 2, -3, 3, -4, 4, -5, 5, -6, 6, -7, 7);
assert_eq!(r, avx2::_mm256_cvtepi8_epi16(a));
}
#[simd_test = "avx2"]
unsafe fn _mm256_cvtepi8_epi32() {
let a = i8x16::new(0, 0, -1, 1, -2, 2, -3, 3, -4, 4, -5, 5, -6, 6, -7, 7);
let a =
i8x16::new(0, 0, -1, 1, -2, 2, -3, 3, -4, 4, -5, 5, -6, 6, -7, 7);
let r = i32x8::new(0, 0, -1, 1, -2, 2, -3, 3);
assert_eq!(r, avx2::_mm256_cvtepi8_epi32(a));
}
#[simd_test = "avx2"]
unsafe fn _mm256_cvtepi8_epi64() {
let a = i8x16::new(0, 0, -1, 1, -2, 2, -3, 3, -4, 4, -5, 5, -6, 6, -7, 7);
let a =
i8x16::new(0, 0, -1, 1, -2, 2, -3, 3, -4, 4, -5, 5, -6, 6, -7, 7);
let r = i64x4::new(0, 0, -1, 1);
assert_eq!(r, avx2::_mm256_cvtepi8_epi64(a));
}
@ -2580,11 +2731,11 @@ mod tests {
#[simd_test = "avx2"]
unsafe fn _mm256_hadds_epi16() {
let a = i16x16::splat(2).replace(0,0x7FFF).replace(1,1);
let a = i16x16::splat(2).replace(0, 0x7FFF).replace(1, 1);
let b = i16x16::splat(4);
let r = avx2::_mm256_hadds_epi16(a, b);
let e = i16x16::new(
0x7FFF, 4, 4, 4, 8, 8, 8, 8, 4, 4, 4, 4, 8, 8, 8, 8);
let e =
i16x16::new(0x7FFF, 4, 4, 4, 8, 8, 8, 8, 4, 4, 4, 4, 8, 8, 8, 8);
assert_eq!(r, e);
}
@ -2608,10 +2759,10 @@ mod tests {
#[simd_test = "avx2"]
unsafe fn _mm256_hsubs_epi16() {
let a = i16x16::splat(2).replace(0,0x7FFF).replace(1,-1);
let a = i16x16::splat(2).replace(0, 0x7FFF).replace(1, -1);
let b = i16x16::splat(4);
let r = avx2::_mm256_hsubs_epi16(a, b);
let e = i16x16::splat(0).replace(0,0x7FFF);
let e = i16x16::splat(0).replace(0, 0x7FFF);
assert_eq!(r, e);
}
@ -2902,11 +3053,13 @@ mod tests {
let a = i16x16::splat(2);
let b = i16x16::splat(4);
let r = avx2::_mm256_packs_epi16(a, b);
#[cfg_attr(rustfmt, rustfmt_skip)]
let e = i8x32::new(
2, 2, 2, 2, 2, 2, 2, 2,
4, 4, 4, 4, 4, 4, 4, 4,
2, 2, 2, 2, 2, 2, 2, 2,
4, 4, 4, 4, 4, 4, 4, 4);
4, 4, 4, 4, 4, 4, 4, 4,
);
assert_eq!(r, e);
}
@ -2916,11 +3069,7 @@ mod tests {
let a = i32x8::splat(2);
let b = i32x8::splat(4);
let r = avx2::_mm256_packs_epi32(a, b);
let e = i16x16::new(
2, 2, 2, 2,
4, 4, 4, 4,
2, 2, 2, 2,
4, 4, 4, 4);
let e = i16x16::new(2, 2, 2, 2, 4, 4, 4, 4, 2, 2, 2, 2, 4, 4, 4, 4);
assert_eq!(r, e);
}
@ -2930,11 +3079,13 @@ mod tests {
let a = i16x16::splat(2);
let b = i16x16::splat(4);
let r = avx2::_mm256_packus_epi16(a, b);
#[cfg_attr(rustfmt, rustfmt_skip)]
let e = u8x32::new(
2, 2, 2, 2, 2, 2, 2, 2,
4, 4, 4, 4, 4, 4, 4, 4,
2, 2, 2, 2, 2, 2, 2, 2,
4, 4, 4, 4, 4, 4, 4, 4);
4, 4, 4, 4, 4, 4, 4, 4,
);
assert_eq!(r, e);
}
@ -2944,11 +3095,7 @@ mod tests {
let a = i32x8::splat(2);
let b = i32x8::splat(4);
let r = avx2::_mm256_packus_epi32(a, b);
let e = u16x16::new(
2, 2, 2, 2,
4, 4, 4, 4,
2, 2, 2, 2,
4, 4, 4, 4);
let e = u16x16::new(2, 2, 2, 2, 4, 4, 4, 4, 2, 2, 2, 2, 4, 4, 4, 4);
assert_eq!(r, e);
}
@ -3017,21 +3164,24 @@ mod tests {
unsafe fn _mm256_slli_epi16() {
assert_eq!(
avx2::_mm256_slli_epi16(i16x16::splat(0xFF), 4),
i16x16::splat(0xFF0));
i16x16::splat(0xFF0)
);
}
#[simd_test = "avx2"]
unsafe fn _mm256_slli_epi32() {
assert_eq!(
avx2::_mm256_slli_epi32(i32x8::splat(0xFFFF), 4),
i32x8::splat(0xFFFF0));
i32x8::splat(0xFFFF0)
);
}
#[simd_test = "avx2"]
unsafe fn _mm256_slli_epi64() {
assert_eq!(
avx2::_mm256_slli_epi64(i64x4::splat(0xFFFFFFFF), 4),
i64x4::splat(0xFFFFFFFF0));
i64x4::splat(0xFFFFFFFF0)
);
}
#[simd_test = "avx2"]
@ -3090,14 +3240,16 @@ mod tests {
unsafe fn _mm256_srai_epi16() {
assert_eq!(
avx2::_mm256_srai_epi16(i16x16::splat(-1), 1),
i16x16::splat(-1));
i16x16::splat(-1)
);
}
#[simd_test = "avx2"]
unsafe fn _mm256_srai_epi32() {
assert_eq!(
avx2::_mm256_srai_epi32(i32x8::splat(-1), 1),
i32x8::splat(-1));
i32x8::splat(-1)
);
}
#[simd_test = "avx2"]
@ -3106,7 +3258,7 @@ mod tests {
let count = i32x4::splat(1);
let r = avx2::_mm_srav_epi32(a, count);
let e = i32x4::splat(2);
assert_eq!(r, e );
assert_eq!(r, e);
}
#[simd_test = "avx2"]
@ -3115,7 +3267,7 @@ mod tests {
let count = i32x8::splat(1);
let r = avx2::_mm256_srav_epi32(a, count);
let e = i32x8::splat(2);
assert_eq!(r, e );
assert_eq!(r, e);
}
#[simd_test = "avx2"]
@ -3146,21 +3298,24 @@ mod tests {
unsafe fn _mm256_srli_epi16() {
assert_eq!(
avx2::_mm256_srli_epi16(i16x16::splat(0xFF), 4),
i16x16::splat(0xF));
i16x16::splat(0xF)
);
}
#[simd_test = "avx2"]
unsafe fn _mm256_srli_epi32() {
assert_eq!(
avx2::_mm256_srli_epi32(i32x8::splat(0xFFFF), 4),
i32x8::splat(0xFFF));
i32x8::splat(0xFFF)
);
}
#[simd_test = "avx2"]
unsafe fn _mm256_srli_epi64() {
assert_eq!(
avx2::_mm256_srli_epi64(i64x4::splat(0xFFFFFFFF), 4),
i64x4::splat(0xFFFFFFF));
i64x4::splat(0xFFFFFFF)
);
}
#[simd_test = "avx2"]
@ -3274,41 +3429,51 @@ mod tests {
#[simd_test = "avx2"]
unsafe fn _mm256_alignr_epi8() {
#[cfg_attr(rustfmt, rustfmt_skip)]
let a = i8x32::new(
1, 2, 3, 4, 5, 6, 7, 8,
9, 10, 11, 12, 13, 14, 15, 16,
17, 18, 19, 20, 21, 22, 23, 24,
25, 26, 27, 28, 29, 30, 31, 32);
25, 26, 27, 28, 29, 30, 31, 32,
);
#[cfg_attr(rustfmt, rustfmt_skip)]
let b = i8x32::new(
-1, -2, -3, -4, -5, -6, -7, -8,
-9, -10, -11, -12, -13, -14, -15, -16,
-17, -18, -19, -20, -21, -22, -23, -24,
-25, -26, -27, -28, -29, -30, -31, -32);
-25, -26, -27, -28, -29, -30, -31, -32,
);
let r = avx2::_mm256_alignr_epi8(a, b, 33);
assert_eq!(r, i8x32::splat(0));
let r = avx2::_mm256_alignr_epi8(a, b, 17);
#[cfg_attr(rustfmt, rustfmt_skip)]
let expected = i8x32::new(
2, 3, 4, 5, 6, 7, 8, 9,
10, 11, 12, 13, 14, 15, 16, 17,
18, 19, 20, 21, 22, 23, 24, 25,
26, 27, 28, 29, 30, 31, 32, 0);
26, 27, 28, 29, 30, 31, 32, 0,
);
assert_eq!(r, expected);
#[cfg_attr(rustfmt, rustfmt_skip)]
let expected = i8x32::new(
-17, -18, -19, -20, -21, -22, -23, -24,
-25, -26, -27, -28, -29, -30, -31, -32,
1, 2, 3, 4, 5, 6, 7, 8,
9, 10, 11, 12, 13, 14, 15, 16);
9, 10, 11, 12, 13, 14, 15, 16,
);
let r = avx2::_mm256_alignr_epi8(a, b, 16);
assert_eq!(r, expected);
let r = avx2::_mm256_alignr_epi8(a, b, 15);
#[cfg_attr(rustfmt, rustfmt_skip)]
let expected = i8x32::new(
-16, -17, -18, -19, -20, -21, -22, -23,
-24, -25, -26, -27, -28, -29, -30, -31,
-32, 1, 2, 3, 4, 5, 6, 7,
8, 9, 10, 11, 12, 13, 14, 15);
8, 9, 10, 11, 12, 13, 14, 15,
);
assert_eq!(r, expected);
let r = avx2::_mm256_alignr_epi8(a, b, 0);
@ -3317,18 +3482,21 @@ mod tests {
#[simd_test = "avx2"]
unsafe fn _mm256_shuffle_epi8() {
#[cfg_attr(rustfmt, rustfmt_skip)]
let a = u8x32::new(
1, 2, 3, 4, 5, 6, 7, 8,
9, 10, 11, 12, 13, 14, 15, 16,
17, 18, 19, 20, 21, 22, 23, 24,
25, 26, 27, 28, 29, 30, 31, 32
25, 26, 27, 28, 29, 30, 31, 32,
);
#[cfg_attr(rustfmt, rustfmt_skip)]
let b = u8x32::new(
4, 128, 4, 3, 24, 12, 6, 19,
12, 5, 5, 10, 4, 1, 8, 0,
4, 128, 4, 3, 24, 12, 6, 19,
12, 5, 5, 10, 4, 1, 8, 0,
);
#[cfg_attr(rustfmt, rustfmt_skip)]
let expected = u8x32::new(
5, 0, 5, 4, 9, 13, 7, 4,
13, 6, 6, 11, 5, 2, 9, 1,

View file

@ -1,11 +1,16 @@
//! Bit Manipulation Instruction (BMI) Set 1.0.
//!
//! The reference is [Intel 64 and IA-32 Architectures Software Developer's
//! Manual Volume 2: Instruction Set Reference,
//! A-Z](http://www.intel.de/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-instruction-set-reference-manual-325383.pdf).
//! Manual Volume 2: Instruction Set Reference, A-Z][intel64_ref].
//!
//! [Wikipedia](https://en.wikipedia.org/wiki/Bit_Manipulation_Instruction_Sets#BMI1_.28Bit_Manipulation_Instruction_Set_1.29)
//! provides a quick overview of the available instructions.
//! [Wikipedia][wikipedia_bmi] provides a quick overview of the instructions
//! available.
//!
//! [intel64_ref]: http://www.intel.de/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-instruction-set-reference-manual-325383.pdf
//! [wikipedia_bmi]:
//! https://en.wikipedia.org/wiki/Bit_Manipulation_Instruction_Sets#ABM_.
//! 28Advanced_Bit_Manipulation.29
#[cfg(test)]
use stdsimd_test::assert_instr;
@ -32,8 +37,8 @@ pub unsafe fn _bextr_u64(a: u64, start: u64, len: u64) -> u64 {
/// Extracts bits of `a` specified by `control` into
/// the least significant bits of the result.
///
/// Bits [7,0] of `control` specify the index to the first bit in the range to be
/// extracted, and bits [15,8] specify the length of the range.
/// Bits [7,0] of `control` specify the index to the first bit in the range to
/// be extracted, and bits [15,8] specify the length of the range.
#[inline(always)]
#[target_feature = "+bmi"]
#[cfg_attr(test, assert_instr(bextr))]
@ -44,8 +49,8 @@ pub unsafe fn _bextr2_u32(a: u32, control: u32) -> u32 {
/// Extracts bits of `a` specified by `control` into
/// the least significant bits of the result.
///
/// Bits [7,0] of `control` specify the index to the first bit in the range to be
/// extracted, and bits [15,8] specify the length of the range.
/// Bits [7,0] of `control` specify the index to the first bit in the range to
/// be extracted, and bits [15,8] specify the length of the range.
#[inline(always)]
#[target_feature = "+bmi"]
#[cfg_attr(test, assert_instr(bextr))]
@ -177,9 +182,9 @@ pub unsafe fn _mm_tzcnt_u64(x: u64) -> u64 {
#[allow(dead_code)]
extern "C" {
#[link_name="llvm.x86.bmi.bextr.32"]
#[link_name = "llvm.x86.bmi.bextr.32"]
fn x86_bmi_bextr_32(x: u32, y: u32) -> u32;
#[link_name="llvm.x86.bmi.bextr.64"]
#[link_name = "llvm.x86.bmi.bextr.64"]
fn x86_bmi_bextr_64(x: u64, y: u64) -> u64;
}

View file

@ -1,11 +1,15 @@
//! Bit Manipulation Instruction (BMI) Set 2.0.
//!
//! The reference is [Intel 64 and IA-32 Architectures Software Developer's
//! Manual Volume 2: Instruction Set Reference,
//! A-Z](http://www.intel.de/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectu res-software-developer-instruction-set-reference-manual-325383.pdf).
//! Manual Volume 2: Instruction Set Reference, A-Z][intel64_ref].
//!
//! [Wikipedia](https://en.wikipedia.org/wiki/Bit_Manipulation_Instruction_Sets#BMI2_.28Bit_Manipulation_Instruction_Set_2.29)
//! provides a quick overview of the available instructions.
//! [Wikipedia][wikipedia_bmi] provides a quick overview of the instructions
//! available.
//!
//! [intel64_ref]: http://www.intel.de/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-instruction-set-reference-manual-325383.pdf
//! [wikipedia_bmi]:
//! https://en.wikipedia.org/wiki/Bit_Manipulation_Instruction_Sets#ABM_.
//! 28Advanced_Bit_Manipulation.29
#[cfg(test)]
use stdsimd_test::assert_instr;
@ -96,17 +100,17 @@ pub unsafe fn _pext_u64(a: u64, mask: u64) -> u64 {
#[allow(dead_code)]
extern "C" {
#[link_name="llvm.x86.bmi.bzhi.32"]
#[link_name = "llvm.x86.bmi.bzhi.32"]
fn x86_bmi2_bzhi_32(x: u32, y: u32) -> u32;
#[link_name="llvm.x86.bmi.bzhi.64"]
#[link_name = "llvm.x86.bmi.bzhi.64"]
fn x86_bmi2_bzhi_64(x: u64, y: u64) -> u64;
#[link_name="llvm.x86.bmi.pdep.32"]
#[link_name = "llvm.x86.bmi.pdep.32"]
fn x86_bmi2_pdep_32(x: u32, y: u32) -> u32;
#[link_name="llvm.x86.bmi.pdep.64"]
#[link_name = "llvm.x86.bmi.pdep.64"]
fn x86_bmi2_pdep_64(x: u64, y: u64) -> u64;
#[link_name="llvm.x86.bmi.pext.32"]
#[link_name = "llvm.x86.bmi.pext.32"]
fn x86_bmi2_pext_32(x: u32, y: u32) -> u32;
#[link_name="llvm.x86.bmi.pext.64"]
#[link_name = "llvm.x86.bmi.pext.64"]
fn x86_bmi2_pext_64(x: u64, y: u64) -> u64;
}
@ -118,7 +122,7 @@ mod tests {
#[simd_test = "bmi2"]
unsafe fn _pext_u32() {
let n = 0b1011_1110_1001_0011u32;
let n = 0b1011_1110_1001_0011u32;
let m0 = 0b0110_0011_1000_0101u32;
let s0 = 0b0000_0000_0011_0101u32;
@ -133,7 +137,7 @@ mod tests {
#[simd_test = "bmi2"]
#[cfg(not(target_arch = "x86"))]
unsafe fn _pext_u64() {
let n = 0b1011_1110_1001_0011u64;
let n = 0b1011_1110_1001_0011u64;
let m0 = 0b0110_0011_1000_0101u64;
let s0 = 0b0000_0000_0011_0101u64;
@ -147,7 +151,7 @@ mod tests {
#[simd_test = "bmi2"]
unsafe fn _pdep_u32() {
let n = 0b1011_1110_1001_0011u32;
let n = 0b1011_1110_1001_0011u32;
let m0 = 0b0110_0011_1000_0101u32;
let s0 = 0b0000_0010_0000_0101u32;
@ -162,7 +166,7 @@ mod tests {
#[simd_test = "bmi2"]
#[cfg(not(target_arch = "x86"))]
unsafe fn _pdep_u64() {
let n = 0b1011_1110_1001_0011u64;
let n = 0b1011_1110_1001_0011u64;
let m0 = 0b0110_0011_1000_0101u64;
let s0 = 0b0000_0010_0000_0101u64;
@ -194,23 +198,31 @@ mod tests {
let a: u32 = 4_294_967_200;
let b: u32 = 2;
let (lo, hi): (u32, u32) = bmi2::_mulx_u32(a, b);
// result = 8589934400
// = 0b0001_1111_1111_1111_1111_1111_1111_0100_0000u64
// ^~hi ^~lo~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
/*
result = 8589934400
= 0b0001_1111_1111_1111_1111_1111_1111_0100_0000u64
^~hi ^~lo~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
*/
assert_eq!(lo, 0b1111_1111_1111_1111_1111_1111_0100_0000u32);
assert_eq!(hi, 0b0001u32);
}
#[simd_test = "bmi2"]
#[cfg(not(target_arch = "x86"))]
#[cfg_attr(rustfmt, rustfmt_skip)]
unsafe fn _mulx_u64() {
let a: u64 = 9_223_372_036_854_775_800;
let b: u64 = 100;
let (lo, hi): (u64, u64) = bmi2::_mulx_u64(a, b);
// result = 922337203685477580000
// = 0b00110001_11111111_11111111_11111111_11111111_11111111_11111111_11111100_11100000u128
// ^~hi~~~~ ^~lo~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
assert_eq!(lo, 0b11111111_11111111_11111111_11111111_11111111_11111111_11111100_11100000u64);
/*
result = 922337203685477580000 =
0b00110001_1111111111111111_1111111111111111_1111111111111111_1111110011100000
^~hi~~~~ ^~lo~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
*/
assert_eq!(
lo,
0b11111111_11111111_11111111_11111111_11111111_11111111_11111100_11100000u64
);
assert_eq!(hi, 0b00110001u64);
}
}

View file

@ -348,5 +348,3 @@ macro_rules! assert_approx_eq {
*a, *b, $eps, (*a - *b).abs());
})
}

View file

@ -12,7 +12,7 @@ pub use self::bmi::*;
pub use self::bmi2::*;
pub use self::tbm::*;
pub use self::runtime::{__Feature, __unstable_detect_feature};
pub use self::runtime::{__unstable_detect_feature, __Feature};
#[allow(non_camel_case_types)]
pub type __m128i = ::v128::i8x16;

View file

@ -1,9 +1,9 @@
//! This module implements minimal run-time feature detection for x86.
//!
//! The features are detected using the `detect_features` function below. This function
//! uses the CPUID instruction to read the feature flags from the CPU and encodes them in
//! an `usize` where each bit position represents whether a feature is available (bit is set)
//! or unavaiable (bit is cleared).
//! The features are detected using the `detect_features` function below.
//! This function uses the CPUID instruction to read the feature flags from the
//! CPU and encodes them in an `usize` where each bit position represents
//! whether a feature is available (bit is set) or unavaiable (bit is cleared).
//!
//! The enum `__Feature` is used to map bit positions to feature names, and the
//! the `__unstable_detect_feature!` macro is used to map string literals (e.g.
@ -12,10 +12,10 @@
//!
//! The run-time feature detection is performed by the
//! `__unstable_detect_feature(__Feature) -> bool` function. On its first call,
//! this functions queries the CPU for the available features and stores them in
//! a global `AtomicUsize` variable. The query is performed by just checking whether the
//! feature bit in this global variable is set or cleared.
use ::std::sync::atomic::{AtomicUsize, Ordering};
//! this functions queries the CPU for the available features and stores them
//! in a global `AtomicUsize` variable. The query is performed by just checking
//! whether the feature bit in this global variable is set or cleared.
use std::sync::atomic::{AtomicUsize, Ordering};
/// This macro maps the string-literal feature names to values of the
/// `__Feature` enum at compile-time. The feature names used are the same as
@ -26,22 +26,68 @@ use ::std::sync::atomic::{AtomicUsize, Ordering};
#[macro_export]
#[doc(hidden)]
macro_rules! __unstable_detect_feature {
("sse") => { $crate::vendor::__unstable_detect_feature($crate::vendor::__Feature::sse{}) };
("sse2") => { $crate::vendor::__unstable_detect_feature($crate::vendor::__Feature::sse2{}) };
("sse3") => { $crate::vendor::__unstable_detect_feature($crate::vendor::__Feature::sse3{}) };
("ssse3") => { $crate::vendor::__unstable_detect_feature($crate::vendor::__Feature::ssse3{}) };
("sse4.1") => { $crate::vendor::__unstable_detect_feature($crate::vendor::__Feature::sse4_1{}) };
("sse4.2") => { $crate::vendor::__unstable_detect_feature($crate::vendor::__Feature::sse4_2{}) };
("avx") => { $crate::vendor::__unstable_detect_feature($crate::vendor::__Feature::avx{}) };
("avx2") => { $crate::vendor::__unstable_detect_feature($crate::vendor::__Feature::avx2{}) };
("fma") => { $crate::vendor::__unstable_detect_feature($crate::vendor::__Feature::fma{}) };
("bmi") => { $crate::vendor::__unstable_detect_feature($crate::vendor::__Feature::bmi{}) };
("bmi2") => { $crate::vendor::__unstable_detect_feature($crate::vendor::__Feature::bmi2{}) };
("abm") => { $crate::vendor::__unstable_detect_feature($crate::vendor::__Feature::abm{}) };
("lzcnt") => { $crate::vendor::__unstable_detect_feature($crate::vendor::__Feature::abm{}) };
("tbm") => { $crate::vendor::__unstable_detect_feature($crate::vendor::__Feature::tbm{}) };
("popcnt") => { $crate::vendor::__unstable_detect_feature($crate::vendor::__Feature::popcnt{}) };
($t:tt) => { compile_error!(concat!("unknown target feature: ", $t)) };
("sse") => {
$crate::vendor::__unstable_detect_feature(
$crate::vendor::__Feature::sse{}) };
("sse2") => {
$crate::vendor::__unstable_detect_feature(
$crate::vendor::__Feature::sse2{})
};
("sse3") => {
$crate::vendor::__unstable_detect_feature(
$crate::vendor::__Feature::sse3{})
};
("ssse3") => {
$crate::vendor::__unstable_detect_feature(
$crate::vendor::__Feature::ssse3{})
};
("sse4.1") => {
$crate::vendor::__unstable_detect_feature(
$crate::vendor::__Feature::sse4_1{})
};
("sse4.2") => {
$crate::vendor::__unstable_detect_feature(
$crate::vendor::__Feature::sse4_2{})
};
("avx") => {
$crate::vendor::__unstable_detect_feature(
$crate::vendor::__Feature::avx{})
};
("avx2") => {
$crate::vendor::__unstable_detect_feature(
$crate::vendor::__Feature::avx2{})
};
("fma") => {
$crate::vendor::__unstable_detect_feature(
$crate::vendor::__Feature::fma{})
};
("bmi") => {
$crate::vendor::__unstable_detect_feature(
$crate::vendor::__Feature::bmi{})
};
("bmi2") => {
$crate::vendor::__unstable_detect_feature(
$crate::vendor::__Feature::bmi2{})
};
("abm") => {
$crate::vendor::__unstable_detect_feature(
$crate::vendor::__Feature::abm{})
};
("lzcnt") => {
$crate::vendor::__unstable_detect_feature(
$crate::vendor::__Feature::abm{})
};
("tbm") => {
$crate::vendor::__unstable_detect_feature(
$crate::vendor::__Feature::tbm{})
};
("popcnt") => {
$crate::vendor::__unstable_detect_feature(
$crate::vendor::__Feature::popcnt{})
};
($t:tt) => {
compile_error!(concat!("unknown target feature: ", $t))
};
}
/// X86 CPU Feature enum. Each variant denotes a position in a bitset for a
@ -74,15 +120,15 @@ pub enum __Feature {
bmi,
/// BMI1 (Bit Manipulation Instructions 2)
bmi2,
/// ABM (Advanced Bit Manipulation) on AMD / LZCNT (Leading Zero Count) on Intel
/// ABM (Advanced Bit Manipulation) on AMD / LZCNT (Leading Zero
/// Count) on Intel
abm,
/// TBM (Trailing Bit Manipulation)
tbm,
/// POPCNT (Population Count)
popcnt,
#[doc(hidden)]
__NonExhaustive
#[doc(hidden)] __NonExhaustive,
}
fn set_bit(x: usize, bit: u32) -> usize {
@ -102,14 +148,19 @@ fn inv_test_bit(v: usize, idx: u32) -> bool {
/// Run-time feature detection on x86 works by using the CPUID instruction.
///
/// The [CPUID Wikipedia page](https://en.wikipedia.org/wiki/CPUID) contains all
/// the information about which flags to set to query which values, and in which
/// registers these are reported.
/// The [CPUID Wikipedia page][wiki_cpuid] contains
/// all the information about which flags to set to query which values, and in
/// which registers these are reported.
///
/// The definitive references are:
/// - [Intel 64 and IA-32 Architectures Software Developer's Manual Volume 2: Instruction Set Reference, A-Z](http://www.intel.de/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-instruction-set-reference-manual-325383.pdf).
/// - [AMD64 Architecture Programmer's Manual, Volume 3: General-Purpose and System Instructions](http://support.amd.com/TechDocs/24594.pdf).
/// - [Intel 64 and IA-32 Architectures Software Developer's Manual Volume 2:
/// Instruction Set Reference, A-Z][intel64_ref].
/// - [AMD64 Architecture Programmer's Manual, Volume 3: General-Purpose and
/// System Instructions][amd64_ref].
///
/// [wiki_cpuid]: https://en.wikipedia.org/wiki/CPUID
/// [intel64_ref]: http://www.intel.de/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-instruction-set-reference-manual-325383.pdf
/// [amd64_ref]: http://support.amd.com/TechDocs/24594.pdf
fn detect_features() -> usize {
let ebx;
let ecx;
@ -119,14 +170,16 @@ fn detect_features() -> usize {
/// To obtain all feature flags we need two CPUID queries:
/// 1. EAX=1, ECX=0: Queries "Processor Info and Feature Bits"
/// This gives us most of the CPU features in ECX and EDX (see below),
/// This gives us most of the CPU features in ECX and EDX (see
/// below),
asm!("cpuid"
: "={ecx}"(ecx), "={edx}"(edx)
: "{eax}"(0x00000001u32), "{ecx}"(0 as u32)
: :);
/// 2. EAX=7, ECX=0: Queries "Extended Features"
/// This gives us information about bmi,bmi2, and avx2 support (see below).
/// This gives us information about bmi,bmi2, and avx2 support
/// (see below).
asm!("cpuid"
: "={ebx}"(ebx)
: "{eax}"(0x00000007u32), "{ecx}"(0 as u32)
@ -135,36 +188,65 @@ fn detect_features() -> usize {
let mut value: usize = 0;
// CPUID call with EAX=7, ECX=0 => Extended Features in EBX and ECX (unneeded):
if inv_test_bit(ebx, 3) { value = set_bit(value, __Feature::bmi as u32); }
if inv_test_bit(ebx, 5) { value = set_bit(value, __Feature::avx2 as u32); }
if inv_test_bit(ebx, 8) { value = set_bit(value, __Feature::bmi2 as u32); }
// CPUID call with EAX=7, ECX=0 => Extended Features in EBX and ECX
// (the result in ECX is not currently needed):
if inv_test_bit(ebx, 3) {
value = set_bit(value, __Feature::bmi as u32);
}
if inv_test_bit(ebx, 5) {
value = set_bit(value, __Feature::avx2 as u32);
}
if inv_test_bit(ebx, 8) {
value = set_bit(value, __Feature::bmi2 as u32);
}
// CPUID call with EAX=1 => feature bits in ECX and EDX:
if inv_test_bit(ecx, 0) { value = set_bit(value, __Feature::sse3 as u32); }
if inv_test_bit(ecx, 5) { value = set_bit(value, __Feature::abm as u32); }
if inv_test_bit(ecx, 9) { value = set_bit(value, __Feature::ssse3 as u32); }
if inv_test_bit(ecx, 12) { value = set_bit(value, __Feature::fma as u32); }
if inv_test_bit(ecx, 19) { value = set_bit(value, __Feature::sse4_1 as u32); }
if inv_test_bit(ecx, 20) { value = set_bit(value, __Feature::sse4_2 as u32); }
if inv_test_bit(ecx, 21) { value = set_bit(value, __Feature::tbm as u32); }
if inv_test_bit(ecx, 23) { value = set_bit(value, __Feature::popcnt as u32); }
if inv_test_bit(ecx, 28) { value = set_bit(value, __Feature::avx as u32); }
if inv_test_bit(ecx, 0) {
value = set_bit(value, __Feature::sse3 as u32);
}
if inv_test_bit(ecx, 5) {
value = set_bit(value, __Feature::abm as u32);
}
if inv_test_bit(ecx, 9) {
value = set_bit(value, __Feature::ssse3 as u32);
}
if inv_test_bit(ecx, 12) {
value = set_bit(value, __Feature::fma as u32);
}
if inv_test_bit(ecx, 19) {
value = set_bit(value, __Feature::sse4_1 as u32);
}
if inv_test_bit(ecx, 20) {
value = set_bit(value, __Feature::sse4_2 as u32);
}
if inv_test_bit(ecx, 21) {
value = set_bit(value, __Feature::tbm as u32);
}
if inv_test_bit(ecx, 23) {
value = set_bit(value, __Feature::popcnt as u32);
}
if inv_test_bit(ecx, 28) {
value = set_bit(value, __Feature::avx as u32);
}
if inv_test_bit(edx, 25) { value = set_bit(value, __Feature::sse as u32); }
if inv_test_bit(edx, 26) { value = set_bit(value, __Feature::sse2 as u32); }
if inv_test_bit(edx, 25) {
value = set_bit(value, __Feature::sse as u32);
}
if inv_test_bit(edx, 26) {
value = set_bit(value, __Feature::sse2 as u32);
}
value
}
/// This global variable is a bitset used to cache the features supported by the
/// CPU.
/// This global variable is a bitset used to cache the features supported by
/// the CPU.
static FEATURES: AtomicUsize = AtomicUsize::new(::std::usize::MAX);
/// Performs run-time feature detection.
///
/// On its first invocation, it detects the CPU features and caches them in the
/// `FEATURES` global variable as an `AtomicUsize`.
/// On its first invocation, it detects the CPU features and caches them
/// in the `FEATURES` global variable as an `AtomicUsize`.
///
/// It uses the `__Feature` variant to index into this variable as a bitset. If
/// the bit is set, the feature is enabled, and otherwise it is disabled.
@ -172,7 +254,7 @@ static FEATURES: AtomicUsize = AtomicUsize::new(::std::usize::MAX);
/// PLEASE: do not use this, it is an implementation detail subject to change.
#[doc(hidden)]
pub fn __unstable_detect_feature(x: __Feature) -> bool {
if FEATURES.load(Ordering::Relaxed) == ::std::usize::MAX {
if FEATURES.load(Ordering::Relaxed) == ::std::usize::MAX {
FEATURES.store(detect_features(), Ordering::Relaxed);
}
test_bit(FEATURES.load(Ordering::Relaxed), x as u32)

File diff suppressed because it is too large Load diff

View file

@ -5,9 +5,8 @@ use std::mem;
use std::os::raw::c_void;
use std::ptr;
use simd_llvm::{
simd_cast, simd_shuffle2, simd_shuffle4, simd_shuffle8, simd_shuffle16,
};
use simd_llvm::{simd_cast, simd_shuffle16, simd_shuffle2, simd_shuffle4,
simd_shuffle8};
use x86::__m128i;
use v128::*;
use v64::*;
@ -317,7 +316,9 @@ pub unsafe fn _mm_subs_epu16(a: u16x8, b: u16x8) -> u16x8 {
#[cfg_attr(test, assert_instr(pslldq, imm8 = 1))]
pub unsafe fn _mm_slli_si128(a: __m128i, imm8: i32) -> __m128i {
let (zero, imm8) = (__m128i::splat(0), imm8 as u32);
const fn sub(a: u32, b: u32) -> u32 { a - b }
const fn sub(a: u32, b: u32) -> u32 {
a - b
}
macro_rules! shuffle {
($shift:expr) => {
simd_shuffle16::<__m128i, __m128i>(zero, a, [
@ -333,14 +334,22 @@ pub unsafe fn _mm_slli_si128(a: __m128i, imm8: i32) -> __m128i {
}
}
match imm8 {
0 => shuffle!(0), 1 => shuffle!(1),
2 => shuffle!(2), 3 => shuffle!(3),
4 => shuffle!(4), 5 => shuffle!(5),
6 => shuffle!(6), 7 => shuffle!(7),
8 => shuffle!(8), 9 => shuffle!(9),
10 => shuffle!(10), 11 => shuffle!(11),
12 => shuffle!(12), 13 => shuffle!(13),
14 => shuffle!(14), 15 => shuffle!(15),
0 => shuffle!(0),
1 => shuffle!(1),
2 => shuffle!(2),
3 => shuffle!(3),
4 => shuffle!(4),
5 => shuffle!(5),
6 => shuffle!(6),
7 => shuffle!(7),
8 => shuffle!(8),
9 => shuffle!(9),
10 => shuffle!(10),
11 => shuffle!(11),
12 => shuffle!(12),
13 => shuffle!(13),
14 => shuffle!(14),
15 => shuffle!(15),
_ => shuffle!(16),
}
}
@ -365,7 +374,7 @@ pub unsafe fn _mm_bsrli_si128(a: __m128i, imm8: i32) -> __m128i {
#[inline(always)]
#[target_feature = "+sse2"]
#[cfg_attr(test, assert_instr(psllw))]
pub unsafe fn _mm_slli_epi16(a: i16x8, imm8: i32) -> i16x8 {
pub unsafe fn _mm_slli_epi16(a: i16x8, imm8: i32) -> i16x8 {
pslliw(a, imm8)
}
@ -454,7 +463,9 @@ pub unsafe fn _mm_sra_epi32(a: i32x4, count: i32x4) -> i32x4 {
#[cfg_attr(test, assert_instr(psrldq, imm8 = 1))]
pub unsafe fn _mm_srli_si128(a: __m128i, imm8: i32) -> __m128i {
let (zero, imm8) = (__m128i::splat(0), imm8 as u32);
const fn add(a: u32, b: u32) -> u32 { a + b }
const fn add(a: u32, b: u32) -> u32 {
a + b
}
macro_rules! shuffle {
($shift:expr) => {
simd_shuffle16::<__m128i, __m128i>(a, zero, [
@ -470,14 +481,22 @@ pub unsafe fn _mm_srli_si128(a: __m128i, imm8: i32) -> __m128i {
}
}
match imm8 {
0 => shuffle!(0), 1 => shuffle!(1),
2 => shuffle!(2), 3 => shuffle!(3),
4 => shuffle!(4), 5 => shuffle!(5),
6 => shuffle!(6), 7 => shuffle!(7),
8 => shuffle!(8), 9 => shuffle!(9),
10 => shuffle!(10), 11 => shuffle!(11),
12 => shuffle!(12), 13 => shuffle!(13),
14 => shuffle!(14), 15 => shuffle!(15),
0 => shuffle!(0),
1 => shuffle!(1),
2 => shuffle!(2),
3 => shuffle!(3),
4 => shuffle!(4),
5 => shuffle!(5),
6 => shuffle!(6),
7 => shuffle!(7),
8 => shuffle!(8),
9 => shuffle!(9),
10 => shuffle!(10),
11 => shuffle!(11),
12 => shuffle!(12),
13 => shuffle!(13),
14 => shuffle!(14),
15 => shuffle!(15),
_ => shuffle!(16),
}
}
@ -487,7 +506,7 @@ pub unsafe fn _mm_srli_si128(a: __m128i, imm8: i32) -> __m128i {
#[inline(always)]
#[target_feature = "+sse2"]
#[cfg_attr(test, assert_instr(psrlw))]
pub unsafe fn _mm_srli_epi16(a: i16x8, imm8: i32) -> i16x8 {
pub unsafe fn _mm_srli_epi16(a: i16x8, imm8: i32) -> i16x8 {
psrliw(a, imm8)
}
@ -649,7 +668,7 @@ pub unsafe fn _mm_cmplt_epi32(a: i32x4, b: i32x4) -> i32x4 {
#[inline(always)]
#[target_feature = "+sse2"]
#[cfg_attr(test, assert_instr(cvtdq2pd))]
pub unsafe fn _mm_cvtepi32_pd(a: i32x4) -> f64x2 {
pub unsafe fn _mm_cvtepi32_pd(a: i32x4) -> f64x2 {
simd_cast::<i32x2, f64x2>(simd_shuffle2(a, a, [0, 1]))
}
@ -777,7 +796,7 @@ pub unsafe fn _mm_set_epi32(e3: i32, e2: i32, e1: i32, e0: i32) -> i32x4 {
#[target_feature = "+sse2"]
// no particular instruction to test
pub unsafe fn _mm_set_epi16(
e7: i16, e6: i16, e5: i16, e4: i16, e3: i16, e2: i16, e1: i16, e0: i16,
e7: i16, e6: i16, e5: i16, e4: i16, e3: i16, e2: i16, e1: i16, e0: i16
) -> i16x8 {
i16x8::new(e0, e1, e2, e3, e4, e5, e6, e7)
}
@ -790,6 +809,7 @@ pub unsafe fn _mm_set_epi8(
e15: i8, e14: i8, e13: i8, e12: i8, e11: i8, e10: i8, e9: i8, e8: i8,
e7: i8, e6: i8, e5: i8, e4: i8, e3: i8, e2: i8, e1: i8, e0: i8,
) -> i8x16 {
#[cfg_attr(rustfmt, rustfmt_skip)]
i8x16::new(
e0, e1, e2, e3, e4, e5, e6, e7, e8, e9, e10, e11, e12, e13, e14, e15,
)
@ -840,7 +860,7 @@ pub unsafe fn _mm_setr_epi32(e3: i32, e2: i32, e1: i32, e0: i32) -> i32x4 {
#[target_feature = "+sse2"]
// no particular instruction to test
pub unsafe fn _mm_setr_epi16(
e7: i16, e6: i16, e5: i16, e4: i16, e3: i16, e2: i16, e1: i16, e0: i16,
e7: i16, e6: i16, e5: i16, e4: i16, e3: i16, e2: i16, e1: i16, e0: i16
) -> i16x8 {
i16x8::new(e7, e6, e5, e4, e3, e2, e1, e0)
}
@ -853,6 +873,7 @@ pub unsafe fn _mm_setr_epi8(
e15: i8, e14: i8, e13: i8, e12: i8, e11: i8, e10: i8, e9: i8, e8: i8,
e7: i8, e6: i8, e5: i8, e4: i8, e3: i8, e2: i8, e1: i8, e0: i8,
) -> i8x16 {
#[cfg_attr(rustfmt, rustfmt_skip)]
i8x16::new(
e15, e14, e13, e12, e11, e10, e9, e8, e7, e6, e5, e4, e3, e2, e1, e0,
)
@ -895,7 +916,8 @@ pub unsafe fn _mm_loadu_si128(mem_addr: *const __m128i) -> __m128i {
ptr::copy_nonoverlapping(
mem_addr as *const u8,
&mut dst as *mut __m128i as *mut u8,
mem::size_of::<__m128i>());
mem::size_of::<__m128i>(),
);
dst
}
@ -934,7 +956,8 @@ pub unsafe fn _mm_storeu_si128(mem_addr: *mut __m128i, a: __m128i) {
ptr::copy_nonoverlapping(
&a as *const _ as *const u8,
mem_addr as *mut u8,
mem::size_of::<__m128i>());
mem::size_of::<__m128i>(),
);
}
/// Store the lower 64-bit integer `a` to a memory location.
@ -945,7 +968,10 @@ pub unsafe fn _mm_storeu_si128(mem_addr: *mut __m128i, a: __m128i) {
// no particular instruction to test
pub unsafe fn _mm_storel_epi64(mem_addr: *mut __m128i, a: __m128i) {
ptr::copy_nonoverlapping(
&a as *const _ as *const u8, mem_addr as *mut u8, 8);
&a as *const _ as *const u8,
mem_addr as *mut u8,
8,
);
}
/// Return a vector where the low element is extracted from `a` and its upper
@ -1076,7 +1102,9 @@ pub unsafe fn _mm_shuffle_epi32(a: i32x4, imm8: i32) -> i32x4 {
pub unsafe fn _mm_shufflehi_epi16(a: i16x8, imm8: i32) -> i16x8 {
// See _mm_shuffle_epi32.
let imm8 = (imm8 & 0xFF) as u8;
const fn add4(x: u32) -> u32 { x + 4 }
const fn add4(x: u32) -> u32 {
x + 4
}
macro_rules! shuffle_done {
($x01:expr, $x23:expr, $x45:expr, $x67:expr) => {
@ -1183,10 +1211,11 @@ pub unsafe fn _mm_shufflelo_epi16(a: i16x8, imm8: i32) -> i16x8 {
#[target_feature = "+sse2"]
#[cfg_attr(test, assert_instr(punpckhbw))]
pub unsafe fn _mm_unpackhi_epi8(a: i8x16, b: i8x16) -> i8x16 {
simd_shuffle16(a, b, [
8, 24, 9, 25, 10, 26, 11, 27,
12, 28, 13, 29, 14, 30, 15, 31,
])
simd_shuffle16(
a,
b,
[8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31],
)
}
/// Unpack and interleave 16-bit integers from the high half of `a` and `b`.
@ -1218,10 +1247,11 @@ pub unsafe fn _mm_unpackhi_epi64(a: i64x2, b: i64x2) -> i64x2 {
#[target_feature = "+sse2"]
#[cfg_attr(test, assert_instr(punpcklbw))]
pub unsafe fn _mm_unpacklo_epi8(a: i8x16, b: i8x16) -> i8x16 {
simd_shuffle16(a, b, [
0, 16, 1, 17, 2, 18, 3, 19,
4, 20, 5, 21, 6, 22, 7, 23,
])
simd_shuffle16(
a,
b,
[0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23],
)
}
/// Unpack and interleave 16-bit integers from the low half of `a` and `b`.
@ -1718,7 +1748,8 @@ pub unsafe fn _mm_ucomineq_sd(a: f64x2, b: f64x2) -> bool {
mem::transmute(ucomineqsd(a, b) as u8)
}
/// Convert packed double-precision (64-bit) floating-point elements in "a" to packed single-precision (32-bit) floating-point elements
/// Convert packed double-precision (64-bit) floating-point elements in "a" to
/// packed single-precision (32-bit) floating-point elements
#[inline(always)]
#[target_feature = "+sse2"]
#[cfg_attr(test, assert_instr(cvtpd2ps))]
@ -1726,8 +1757,8 @@ pub unsafe fn _mm_cvtpd_ps(a: f64x2) -> f32x4 {
cvtpd2ps(a)
}
/// Convert packed single-precision (32-bit) floating-point elements in `a` to packed
/// Convert packed single-precision (32-bit) floating-point elements in `a` to
/// packed
/// double-precision (64-bit) floating-point elements.
#[inline(always)]
#[target_feature = "+sse2"]
@ -1736,7 +1767,8 @@ pub unsafe fn _mm_cvtps_pd(a: f32x4) -> f64x2 {
cvtps2pd(a)
}
/// Convert packed double-precision (64-bit) floating-point elements in `a` to packed 32-bit integers.
/// Convert packed double-precision (64-bit) floating-point elements in `a` to
/// packed 32-bit integers.
#[inline(always)]
#[target_feature = "+sse2"]
#[cfg_attr(test, assert_instr(cvtpd2dq))]
@ -1744,7 +1776,8 @@ pub unsafe fn _mm_cvtpd_epi32(a: f64x2) -> i32x4 {
cvtpd2dq(a)
}
/// Convert the lower double-precision (64-bit) floating-point element in a to a 32-bit integer.
/// Convert the lower double-precision (64-bit) floating-point element in a to
/// a 32-bit integer.
#[inline(always)]
#[target_feature = "+sse2"]
#[cfg_attr(test, assert_instr(cvtsd2si))]
@ -1752,7 +1785,8 @@ pub unsafe fn _mm_cvtsd_si32(a: f64x2) -> i32 {
cvtsd2si(a)
}
/// Convert the lower double-precision (64-bit) floating-point element in a to a 64-bit integer.
/// Convert the lower double-precision (64-bit) floating-point element in a to
/// a 64-bit integer.
#[cfg(target_arch = "x86_64")]
#[inline(always)]
#[target_feature = "+sse2"]
@ -1761,9 +1795,10 @@ pub unsafe fn _mm_cvtsd_si64(a: f64x2) -> i64 {
cvtsd2si64(a)
}
/// Convert the lower double-precision (64-bit) floating-point element in `b` to a
/// single-precision (32-bit) floating-point element, store the result in the lower element
/// of the return value, and copy the upper element from `a` to the upper element the return value.
/// Convert the lower double-precision (64-bit) floating-point element in `b`
/// to a single-precision (32-bit) floating-point element, store the result in
/// the lower element of the return value, and copy the upper element from `a`
/// to the upper element the return value.
#[inline(always)]
#[target_feature = "+sse2"]
#[cfg_attr(test, assert_instr(cvtsd2ss))]
@ -1771,18 +1806,19 @@ pub unsafe fn _mm_cvtsd_ss(a: f32x4, b: f64x2) -> f32x4 {
cvtsd2ss(a, b)
}
/// Convert the lower single-precision (32-bit) floating-point element in `b` to a
/// double-precision (64-bit) floating-point element, store the result in the lower element
/// of the return value, and copy the upper element from `a` to the upper element the return value.
/// Convert the lower single-precision (32-bit) floating-point element in `b`
/// to a double-precision (64-bit) floating-point element, store the result in
/// the lower element of the return value, and copy the upper element from `a`
/// to the upper element the return value.
#[inline(always)]
#[target_feature = "+sse2"]
#[cfg_attr(test, assert_instr(cvtss2sd))]
pub unsafe fn _mm_cvtss_sd(a: f64x2, b: f32x4 ) -> f64x2 {
pub unsafe fn _mm_cvtss_sd(a: f64x2, b: f32x4) -> f64x2 {
cvtss2sd(a, b)
}
/// Convert packed double-precision (64-bit) floating-point elements in `a` to packed
/// 32-bit integers with truncation.
/// Convert packed double-precision (64-bit) floating-point elements in `a` to
/// packed 32-bit integers with truncation.
#[inline(always)]
#[target_feature = "+sse2"]
#[cfg_attr(test, assert_instr(cvttpd2dq))]
@ -1790,8 +1826,8 @@ pub unsafe fn _mm_cvttpd_epi32(a: f64x2) -> i32x4 {
cvttpd2dq(a)
}
/// Convert the lower double-precision (64-bit) floating-point element in `a` to a 32-bit integer
/// with truncation.
/// Convert the lower double-precision (64-bit) floating-point element in `a`
/// to a 32-bit integer with truncation.
#[inline(always)]
#[target_feature = "+sse2"]
#[cfg_attr(test, assert_instr(cvttsd2si))]
@ -1799,8 +1835,8 @@ pub unsafe fn _mm_cvttsd_si32(a: f64x2) -> i32 {
cvttsd2si(a)
}
/// Convert the lower double-precision (64-bit) floating-point element in `a` to a 64-bit integer
/// with truncation.
/// Convert the lower double-precision (64-bit) floating-point element in `a`
/// to a 64-bit integer with truncation.
#[cfg(target_arch = "x86_64")]
#[inline(always)]
#[target_feature = "+sse2"]
@ -1809,8 +1845,8 @@ pub unsafe fn _mm_cvttsd_si64(a: f64x2) -> i64 {
cvttsd2si64(a)
}
/// Convert packed single-precision (32-bit) floating-point elements in `a` to packed 32-bit
/// integers with truncation
/// Convert packed single-precision (32-bit) floating-point elements in `a` to
/// packed 32-bit integers with truncation.
#[inline(always)]
#[target_feature = "+sse2"]
#[cfg_attr(test, assert_instr(cvttps2dq))]
@ -1818,45 +1854,48 @@ pub unsafe fn _mm_cvttps_epi32(a: f32x4) -> i32x4 {
cvttps2dq(a)
}
/// Copy double-precision (64-bit) floating-point element `a` to the lower element of the
/// packed 64-bit return value
/// Copy double-precision (64-bit) floating-point element `a` to the lower
/// element of the packed 64-bit return value.
#[inline(always)]
#[target_feature = "+sse2"]
pub unsafe fn _mm_set_sd(a: f64) -> f64x2 {
f64x2::new(a, 0_f64)
}
/// Broadcast double-precision (64-bit) floating-point value a to all elements of the return value
/// Broadcast double-precision (64-bit) floating-point value a to all elements
/// of the return value.
#[inline(always)]
#[target_feature = "+sse2"]
pub unsafe fn _mm_set1_pd(a: f64) -> f64x2 {
f64x2::new(a, a)
}
/// Broadcast double-precision (64-bit) floating-point value a to all elements of the return value
/// Broadcast double-precision (64-bit) floating-point value a to all elements
/// of the return value.
#[inline(always)]
#[target_feature = "+sse2"]
pub unsafe fn _mm_set_pd1(a: f64) -> f64x2 {
f64x2::new(a, a)
}
/// Set packed double-precision (64-bit) floating-point elements in the return value with the
/// supplied values.
/// Set packed double-precision (64-bit) floating-point elements in the return
/// value with the supplied values.
#[inline(always)]
#[target_feature = "+sse2"]
pub unsafe fn _mm_set_pd(a: f64, b: f64) -> f64x2 {
f64x2::new(b, a)
}
/// Set packed double-precision (64-bit) floating-point elements in the return value with the
/// supplied values in reverse order.
/// Set packed double-precision (64-bit) floating-point elements in the return
/// value with the supplied values in reverse order.
#[inline(always)]
#[target_feature = "+sse2"]
pub unsafe fn _mm_setr_pd(a: f64, b: f64) -> f64x2 {
f64x2::new(a, b)
}
/// returns packed double-precision (64-bit) floating-point elements with all zeros.
/// returns packed double-precision (64-bit) floating-point elements with all
/// zeros.
#[inline(always)]
#[target_feature = "+sse2"]
pub unsafe fn _mm_setzero_pd() -> f64x2 {
@ -1876,9 +1915,10 @@ pub unsafe fn _mm_movemask_pd(a: f64x2) -> i32 {
/// Load 128-bits (composed of 2 packed double-precision (64-bit) floating-point elements)
/// from memory into the returned vector. mem_addr must be aligned on a 16-byte boundary or
/// a general-protection exception may be generated.
/// Load 128-bits (composed of 2 packed double-precision (64-bit)
/// floating-point elements) from memory into the returned vector.
/// `mem_addr` must be aligned on a 16-byte boundary or a general-protection
/// exception may be generated.
#[inline(always)]
#[target_feature = "+sse2"]
#[cfg_attr(test, assert_instr(movaps))]
@ -1886,9 +1926,9 @@ pub unsafe fn _mm_load_pd(mem_addr: *const f64) -> f64x2 {
*(mem_addr as *const f64x2)
}
/// Store 128-bits (composed of 2 packed double-precision (64-bit) floating-point elements) from `a`
/// into memory. mem_addr must be aligned on a 16-byte boundary or a general-protection exception
/// may be generated.
/// Store 128-bits (composed of 2 packed double-precision (64-bit)
/// floating-point elements) from `a` into memory. `mem_addr` must be aligned
/// on a 16-byte boundary or a general-protection exception may be generated.
#[inline(always)]
#[target_feature = "+sse2"]
#[cfg_attr(test, assert_instr(movaps))]
@ -1906,12 +1946,13 @@ pub unsafe fn _mm_storeu_pd(mem_addr: *mut f64, a: f64x2) {
ptr::copy_nonoverlapping(
&a as *const f64x2 as *const u8,
mem_addr as *mut u8,
mem::size_of::<f64x2>());
mem::size_of::<f64x2>(),
);
}
/// Store the lower double-precision (64-bit) floating-point element from `a` into 2 contiguous
/// elements in memory. `mem_addr` must be aligned on a 16-byte boundary or a general-protection
/// exception may be generated.
/// Store the lower double-precision (64-bit) floating-point element from `a`
/// into 2 contiguous elements in memory. `mem_addr` must be aligned on a
/// 16-byte boundary or a general-protection exception may be generated.
#[inline(always)]
#[target_feature = "+sse2"]
pub unsafe fn _mm_store1_pd(mem_addr: *mut f64, a: f64x2) {
@ -1919,9 +1960,9 @@ pub unsafe fn _mm_store1_pd(mem_addr: *mut f64, a: f64x2) {
*(mem_addr as *mut f64x2) = b;
}
/// Store the lower double-precision (64-bit) floating-point element from `a` into 2 contiguous
/// elements in memory. `mem_addr` must be aligned on a 16-byte boundary or a general-protection
/// exception may be generated.
/// Store the lower double-precision (64-bit) floating-point element from `a`
/// into 2 contiguous elements in memory. `mem_addr` must be aligned on a
/// 16-byte boundary or a general-protection exception may be generated.
#[inline(always)]
#[target_feature = "+sse2"]
pub unsafe fn _mm_store_pd1(mem_addr: *mut f64, a: f64x2) {
@ -1929,8 +1970,10 @@ pub unsafe fn _mm_store_pd1(mem_addr: *mut f64, a: f64x2) {
*(mem_addr as *mut f64x2) = b;
}
/// Store 2 double-precision (64-bit) floating-point elements from `a` into memory in reverse order.
/// `mem_addr` must be aligned on a 16-byte boundary or a general-protection exception may be generated.
/// Store 2 double-precision (64-bit) floating-point elements from `a` into
/// memory in reverse order.
/// `mem_addr` must be aligned on a 16-byte boundary or a general-protection
/// exception may be generated.
#[inline(always)]
#[target_feature = "+sse2"]
pub unsafe fn _mm_storer_pd(mem_addr: *mut f64, a: f64x2) {
@ -1956,9 +1999,9 @@ pub unsafe fn _mm_load_pd1(mem_addr: *const f64) -> f64x2 {
f64x2::new(d, d)
}
/// Load 2 double-precision (64-bit) floating-point elements from memory into the returned vector
/// in reverse order. mem_addr must be aligned on a 16-byte boundary or a general-protection
/// exception may be generated.
/// Load 2 double-precision (64-bit) floating-point elements from memory into
/// the returned vector in reverse order. `mem_addr` must be aligned on a
/// 16-byte boundary or a general-protection exception may be generated.
#[inline(always)]
#[target_feature = "+sse2"]
#[cfg_attr(test, assert_instr(movapd))]
@ -1967,9 +2010,9 @@ pub unsafe fn _mm_loadr_pd(mem_addr: *const f64) -> f64x2 {
simd_shuffle2(a, a, [1, 0])
}
/// Load 128-bits (composed of 2 packed double-precision (64-bit) floating-point elements)
/// from memory into the returned vector. mem_addr does not need to be aligned on any particular
/// oundary.
/// Load 128-bits (composed of 2 packed double-precision (64-bit)
/// floating-point elements) from memory into the returned vector.
/// `mem_addr` does not need to be aligned on any particular boundary.
#[inline(always)]
#[target_feature = "+sse2"]
#[cfg_attr(test, assert_instr(movups))]
@ -1978,7 +2021,8 @@ pub unsafe fn _mm_loadu_pd(mem_addr: *const f64) -> f64x2 {
ptr::copy_nonoverlapping(
mem_addr as *const u8,
&mut dst as *mut f64x2 as *mut u8,
mem::size_of::<f64x2>());
mem::size_of::<f64x2>(),
);
dst
}
@ -1997,7 +2041,7 @@ pub unsafe fn _mm_undefined_si128() -> __m128i {
}
#[allow(improper_ctypes)]
extern {
extern "C" {
#[link_name = "llvm.x86.sse2.pause"]
fn pause();
#[link_name = "llvm.x86.sse2.clflush"]
@ -2145,7 +2189,7 @@ extern {
#[link_name = "llvm.x86.sse2.cvtsd2ss"]
fn cvtsd2ss(a: f32x4, b: f64x2) -> f32x4;
#[link_name = "llvm.x86.sse2.cvtss2sd"]
fn cvtss2sd(a: f64x2, b: f32x4 ) -> f64x2;
fn cvtss2sd(a: f64x2, b: f32x4) -> f64x2;
#[link_name = "llvm.x86.sse2.cvttpd2dq"]
fn cvttpd2dq(a: f64x2) -> i32x4;
#[link_name = "llvm.x86.sse2.cvttsd2si"]
@ -2160,7 +2204,7 @@ extern {
mod tests {
use std::os::raw::c_void;
use stdsimd_test::simd_test;
use test::black_box; // Used to inhibit constant-folding.
use test::black_box; // Used to inhibit constant-folding.
use v128::*;
use x86::{__m128i, sse2};
@ -2188,13 +2232,17 @@ mod tests {
#[simd_test = "sse2"]
unsafe fn _mm_add_epi8() {
let a = i8x16::new(
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
let a =
i8x16::new(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
#[cfg_attr(rustfmt, rustfmt_skip)]
let b = i8x16::new(
16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31);
16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
);
let r = sse2::_mm_add_epi8(a, b);
#[cfg_attr(rustfmt, rustfmt_skip)]
let e = i8x16::new(
16, 18, 20, 22, 24, 26, 28, 30, 32, 34, 36, 38, 40, 42, 44, 46);
16, 18, 20, 22, 24, 26, 28, 30, 32, 34, 36, 38, 40, 42, 44, 46,
);
assert_eq!(r, e);
}
@ -2235,13 +2283,17 @@ mod tests {
#[simd_test = "sse2"]
unsafe fn _mm_adds_epi8() {
let a = i8x16::new(
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
let a =
i8x16::new(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
#[cfg_attr(rustfmt, rustfmt_skip)]
let b = i8x16::new(
16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31);
16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
);
let r = sse2::_mm_adds_epi8(a, b);
#[cfg_attr(rustfmt, rustfmt_skip)]
let e = i8x16::new(
16, 18, 20, 22, 24, 26, 28, 30, 32, 34, 36, 38, 40, 42, 44, 46);
16, 18, 20, 22, 24, 26, 28, 30, 32, 34, 36, 38, 40, 42, 44, 46,
);
assert_eq!(r, e);
}
@ -2288,13 +2340,17 @@ mod tests {
#[simd_test = "sse2"]
unsafe fn _mm_adds_epu8() {
let a = u8x16::new(
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
let a =
u8x16::new(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
#[cfg_attr(rustfmt, rustfmt_skip)]
let b = u8x16::new(
16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31);
16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
);
let r = sse2::_mm_adds_epu8(a, b);
#[cfg_attr(rustfmt, rustfmt_skip)]
let e = u8x16::new(
16, 18, 20, 22, 24, 26, 28, 30, 32, 34, 36, 38, 40, 42, 44, 46);
16, 18, 20, 22, 24, 26, 28, 30, 32, 34, 36, 38, 40, 42, 44, 46,
);
assert_eq!(r, e);
}
@ -2410,12 +2466,11 @@ mod tests {
#[simd_test = "sse2"]
unsafe fn _mm_sad_epu8() {
#[cfg_attr(rustfmt, rustfmt_skip)]
let a = u8x16::new(
255, 254, 253, 252, 1, 2, 3, 4,
155, 154, 153, 152, 1, 2, 3, 4);
let b = u8x16::new(
0, 0, 0, 0, 2, 1, 2, 1,
1, 1, 1, 1, 1, 2, 1, 2);
255, 254, 253, 252, 1, 2, 3, 4, 155, 154, 153, 152, 1, 2, 3, 4,
);
let b = u8x16::new(0, 0, 0, 0, 2, 1, 2, 1, 1, 1, 1, 1, 1, 2, 1, 2);
let r = sse2::_mm_sad_epu8(a, b);
let e = u64x2::new(1020, 614);
assert_eq!(r, e);
@ -2527,44 +2582,58 @@ mod tests {
#[simd_test = "sse2"]
unsafe fn _mm_slli_si128() {
#[cfg_attr(rustfmt, rustfmt_skip)]
let a = __m128i::new(
1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
);
let r = sse2::_mm_slli_si128(a, 1);
let e = __m128i::new(
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
let e =
__m128i::new(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
assert_eq!(r, e);
#[cfg_attr(rustfmt, rustfmt_skip)]
let a = __m128i::new(
1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
);
let r = sse2::_mm_slli_si128(a, 15);
let e = __m128i::new(
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1);
let e = __m128i::new(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1);
assert_eq!(r, e);
#[cfg_attr(rustfmt, rustfmt_skip)]
let a = __m128i::new(
1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
);
let r = sse2::_mm_slli_si128(a, 16);
assert_eq!(r, __m128i::splat(0));
#[cfg_attr(rustfmt, rustfmt_skip)]
let a = __m128i::new(
1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
);
let r = sse2::_mm_slli_si128(a, -1);
assert_eq!(r, __m128i::splat(0));
#[cfg_attr(rustfmt, rustfmt_skip)]
let a = __m128i::new(
1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
);
let r = sse2::_mm_slli_si128(a, -0x80000000);
assert_eq!(r, __m128i::splat(0));
}
#[simd_test = "sse2"]
unsafe fn _mm_slli_epi16() {
#[cfg_attr(rustfmt, rustfmt_skip)]
let a = i16x8::new(
0xFFFF as u16 as i16, 0x0FFF, 0x00FF, 0x000F, 0, 0, 0, 0);
0xFFFF as u16 as i16, 0x0FFF, 0x00FF, 0x000F, 0, 0, 0, 0,
);
let r = sse2::_mm_slli_epi16(a, 4);
#[cfg_attr(rustfmt, rustfmt_skip)]
let e = i16x8::new(
0xFFF0 as u16 as i16,
0xFFF0 as u16 as i16, 0x0FF0, 0x00F0, 0, 0, 0, 0);
0xFFF0 as u16 as i16, 0xFFF0 as u16 as i16, 0x0FF0, 0x00F0,
0, 0, 0, 0,
);
assert_eq!(r, e);
}
@ -2635,44 +2704,58 @@ mod tests {
#[simd_test = "sse2"]
unsafe fn _mm_srli_si128() {
#[cfg_attr(rustfmt, rustfmt_skip)]
let a = __m128i::new(
1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
);
let r = sse2::_mm_srli_si128(a, 1);
#[cfg_attr(rustfmt, rustfmt_skip)]
let e = __m128i::new(
2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 0);
2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 0,
);
assert_eq!(r, e);
#[cfg_attr(rustfmt, rustfmt_skip)]
let a = __m128i::new(
1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
);
let r = sse2::_mm_srli_si128(a, 15);
let e = __m128i::new(
16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
let e = __m128i::new(16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
assert_eq!(r, e);
#[cfg_attr(rustfmt, rustfmt_skip)]
let a = __m128i::new(
1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
);
let r = sse2::_mm_srli_si128(a, 16);
assert_eq!(r, __m128i::splat(0));
#[cfg_attr(rustfmt, rustfmt_skip)]
let a = __m128i::new(
1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
);
let r = sse2::_mm_srli_si128(a, -1);
assert_eq!(r, __m128i::splat(0));
#[cfg_attr(rustfmt, rustfmt_skip)]
let a = __m128i::new(
1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
);
let r = sse2::_mm_srli_si128(a, -0x80000000);
assert_eq!(r, __m128i::splat(0));
}
#[simd_test = "sse2"]
unsafe fn _mm_srli_epi16() {
#[cfg_attr(rustfmt, rustfmt_skip)]
let a = i16x8::new(
0xFFFF as u16 as i16, 0x0FFF, 0x00FF, 0x000F, 0, 0, 0, 0);
0xFFFF as u16 as i16, 0x0FFF, 0x00FF, 0x000F, 0, 0, 0, 0,
);
let r = sse2::_mm_srli_epi16(a, 4);
#[cfg_attr(rustfmt, rustfmt_skip)]
let e = i16x8::new(
0xFFF as u16 as i16,
0xFF as u16 as i16, 0xF, 0, 0, 0, 0, 0);
0xFFF as u16 as i16, 0xFF as u16 as i16, 0xF, 0, 0, 0, 0, 0,
);
assert_eq!(r, e);
}
@ -2747,13 +2830,18 @@ mod tests {
#[simd_test = "sse2"]
unsafe fn _mm_cmpeq_epi8() {
let a = i8x16::new(
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
let b = i8x16::new(
15, 14, 2, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
let a =
i8x16::new(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
let b =
i8x16::new(15, 14, 2, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
let r = sse2::_mm_cmpeq_epi8(a, b);
assert_eq!(r, i8x16::new(
0, 0, 0xFFu8 as i8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0));
assert_eq!(
r,
#[cfg_attr(rustfmt, rustfmt_skip)]
i8x16::new(
0, 0, 0xFFu8 as i8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
)
);
}
#[simd_test = "sse2"]
@ -2902,18 +2990,12 @@ mod tests {
#[simd_test = "sse2"]
unsafe fn _mm_set_epi8() {
#[cfg_attr(rustfmt, rustfmt_skip)]
let r = sse2::_mm_set_epi8(
0, 1, 2, 3,
4, 5, 6, 7,
8, 9, 10, 11,
12, 13, 14, 15,
);
let e = i8x16::new(
15, 14, 13, 12,
11, 10, 9, 8,
7, 6, 5, 4,
3, 2, 1, 0,
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
);
let e =
i8x16::new(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
assert_eq!(r, e);
}
@ -2955,18 +3037,12 @@ mod tests {
#[simd_test = "sse2"]
unsafe fn _mm_setr_epi8() {
#[cfg_attr(rustfmt, rustfmt_skip)]
let r = sse2::_mm_setr_epi8(
0, 1, 2, 3,
4, 5, 6, 7,
8, 9, 10, 11,
12, 13, 14, 15,
);
let e = i8x16::new(
0, 1, 2, 3,
4, 5, 6, 7,
8, 9, 10, 11,
12, 13, 14, 15,
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
);
let e =
i8x16::new(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
assert_eq!(r, e);
}
@ -3042,9 +3118,13 @@ mod tests {
let a = i16x8::new(0x80, -0x81, 0, 0, 0, 0, 0, 0);
let b = i16x8::new(0, 0, 0, 0, 0, 0, -0x81, 0x80);
let r = sse2::_mm_packs_epi16(a, b);
assert_eq!(r, i8x16::new(
0x7F, -0x80, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, -0x80, 0x7F));
assert_eq!(
r,
#[cfg_attr(rustfmt, rustfmt_skip)]
i8x16::new(
0x7F, -0x80, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -0x80, 0x7F
)
);
}
#[simd_test = "sse2"]
@ -3053,7 +3133,9 @@ mod tests {
let b = i32x4::new(0, 0, -0x8001, 0x8000);
let r = sse2::_mm_packs_epi32(a, b);
assert_eq!(
r, i16x8::new(0x7FFF, -0x8000, 0, 0, 0, 0, -0x8000, 0x7FFF));
r,
i16x8::new(0x7FFF, -0x8000, 0, 0, 0, 0, -0x8000, 0x7FFF)
);
}
#[simd_test = "sse2"]
@ -3061,9 +3143,10 @@ mod tests {
let a = i16x8::new(0x100, -1, 0, 0, 0, 0, 0, 0);
let b = i16x8::new(0, 0, 0, 0, 0, 0, -1, 0x100);
let r = sse2::_mm_packus_epi16(a, b);
assert_eq!(r, u8x16::new(
0xFF, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0xFF));
assert_eq!(
r,
u8x16::new(0xFF, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0xFF)
);
}
#[simd_test = "sse2"]
@ -3082,9 +3165,9 @@ mod tests {
#[simd_test = "sse2"]
unsafe fn _mm_movemask_epi8() {
let a = i8x16::from(u8x16::new(
let a = i8x16::from(#[cfg_attr(rustfmt, rustfmt_skip)] u8x16::new(
0b1000_0000, 0b0, 0b1000_0000, 0b01, 0b0101, 0b1111_0000, 0, 0,
0, 0, 0b1111_0000, 0b0101, 0b01, 0b1000_0000, 0b0, 0b1000_0000));
0, 0, 0b1111_0000, 0b0101, 0b01, 0b1000_0000, 0b0, 0b1000_0000, ));
let r = sse2::_mm_movemask_epi8(a);
assert_eq!(r, 0b10100100_00100101);
}
@ -3115,13 +3198,17 @@ mod tests {
#[simd_test = "sse2"]
unsafe fn _mm_unpackhi_epi8() {
let a = i8x16::new(
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
let a =
i8x16::new(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
#[cfg_attr(rustfmt, rustfmt_skip)]
let b = i8x16::new(
16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31);
16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
);
let r = sse2::_mm_unpackhi_epi8(a, b);
#[cfg_attr(rustfmt, rustfmt_skip)]
let e = i8x16::new(
8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31,
);
assert_eq!(r, e);
}
@ -3154,13 +3241,15 @@ mod tests {
#[simd_test = "sse2"]
unsafe fn _mm_unpacklo_epi8() {
let a = i8x16::new(
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
let a =
i8x16::new(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
#[cfg_attr(rustfmt, rustfmt_skip)]
let b = i8x16::new(
16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31);
16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
);
let r = sse2::_mm_unpacklo_epi8(a, b);
let e = i8x16::new(
0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
let e =
i8x16::new(0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
assert_eq!(r, e);
}
@ -3825,7 +3914,7 @@ mod tests {
#[simd_test = "sse2"]
unsafe fn _mm_cvtpd_ps() {
use std::{f64,f32};
use std::{f32, f64};
let r = sse2::_mm_cvtpd_ps(f64x2::new(-1.0, 5.0));
assert_eq!(r, f32x4::new(-1.0, 5.0, 0.0, 0.0));
@ -3834,20 +3923,23 @@ mod tests {
assert_eq!(r, f32x4::new(-1.0, -5.0, 0.0, 0.0));
let r = sse2::_mm_cvtpd_ps(f64x2::new(f64::MAX, f64::MIN));
assert_eq!(r, f32x4::new(f32::INFINITY, f32::NEG_INFINITY, 0.0,0.0));
assert_eq!(r, f32x4::new(f32::INFINITY, f32::NEG_INFINITY, 0.0, 0.0));
let r = sse2::_mm_cvtpd_ps(f64x2::new(f32::MAX as f64, f32::MIN as f64));
assert_eq!(r, f32x4::new(f32::MAX, f32::MIN, 0.0,0.0));
let r =
sse2::_mm_cvtpd_ps(f64x2::new(f32::MAX as f64, f32::MIN as f64));
assert_eq!(r, f32x4::new(f32::MAX, f32::MIN, 0.0, 0.0));
}
#[simd_test = "sse2"]
unsafe fn _mm_cvtps_pd() {
use std::{f64, f32};
use std::{f32, f64};
let r = sse2::_mm_cvtps_pd(f32x4::new(-1.0, 2.0, -3.0, 5.0));
assert_eq!(r, f64x2::new(-1.0, 2.0));
let r = sse2::_mm_cvtps_pd(f32x4::new(f32::MAX, f32::INFINITY, f32::NEG_INFINITY, f32::MIN));
let r = sse2::_mm_cvtps_pd(
f32x4::new(f32::MAX, f32::INFINITY, f32::NEG_INFINITY, f32::MIN),
);
assert_eq!(r, f64x2::new(f32::MAX as f64, f64::INFINITY));
}
@ -3864,7 +3956,9 @@ mod tests {
let r = sse2::_mm_cvtpd_epi32(f64x2::new(f64::MAX, f64::MIN));
assert_eq!(r, i32x4::new(i32::MIN, i32::MIN, 0, 0));
let r = sse2::_mm_cvtpd_epi32(f64x2::new(f64::INFINITY, f64::NEG_INFINITY));
let r = sse2::_mm_cvtpd_epi32(
f64x2::new(f64::INFINITY, f64::NEG_INFINITY),
);
assert_eq!(r, i32x4::new(i32::MIN, i32::MIN, 0, 0));
let r = sse2::_mm_cvtpd_epi32(f64x2::new(f64::NAN, f64::NAN));
@ -3902,7 +3996,7 @@ mod tests {
#[simd_test = "sse2"]
unsafe fn _mm_cvtsd_ss() {
use std::{f64, f32};
use std::{f32, f64};
let a = f32x4::new(-1.1, -2.2, 3.3, 4.4);
let b = f64x2::new(2.0, -5.0);
@ -3911,17 +4005,26 @@ mod tests {
assert_eq!(r, f32x4::new(2.0, -2.2, 3.3, 4.4));
let a = f32x4::new(-1.1, f32::NEG_INFINITY, f32::MAX, f32::NEG_INFINITY);
let a =
f32x4::new(-1.1, f32::NEG_INFINITY, f32::MAX, f32::NEG_INFINITY);
let b = f64x2::new(f64::INFINITY, -5.0);
let r = sse2::_mm_cvtsd_ss(a, b);
assert_eq!(r, f32x4::new(f32::INFINITY, f32::NEG_INFINITY, f32::MAX, f32::NEG_INFINITY));
assert_eq!(
r,
f32x4::new(
f32::INFINITY,
f32::NEG_INFINITY,
f32::MAX,
f32::NEG_INFINITY
)
);
}
#[simd_test = "sse2"]
unsafe fn _mm_cvtss_sd() {
use std::{f64, f32};
use std::{f32, f64};
let a = f64x2::new(-1.1, 2.2);
let b = f32x4::new(1.0, 2.0, 3.0, 4.0);
@ -3984,7 +4087,8 @@ mod tests {
let r = sse2::_mm_cvttps_epi32(a);
assert_eq!(r, i32x4::new(-1, 2, -3, 6));
let a = f32x4::new(f32::NEG_INFINITY, f32::INFINITY, f32::MIN, f32::MAX);
let a =
f32x4::new(f32::NEG_INFINITY, f32::INFINITY, f32::MIN, f32::MAX);
let r = sse2::_mm_cvttps_epi32(a);
assert_eq!(r, i32x4::new(i32::MIN, i32::MIN, i32::MIN, i32::MIN));
}

View file

@ -106,7 +106,7 @@ pub unsafe fn _mm_moveldup_ps(a: f32x4) -> f32x4 {
}
#[allow(improper_ctypes)]
extern {
extern "C" {
#[link_name = "llvm.x86.sse3.addsub.ps"]
fn addsubps(a: f32x4, b: f32x4) -> f32x4;
#[link_name = "llvm.x86.sse3.addsub.pd"]
@ -129,7 +129,7 @@ mod tests {
use stdsimd_test::simd_test;
use v128::*;
use x86::sse3 as sse3;
use x86::sse3;
#[simd_test = "sse3"]
unsafe fn _mm_addsub_ps() {
@ -181,7 +181,8 @@ mod tests {
#[simd_test = "sse3"]
unsafe fn _mm_lddqu_si128() {
let a = i8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
let a =
i8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
let r = sse3::_mm_lddqu_si128(&a);
assert_eq!(a, r);
}
@ -213,4 +214,4 @@ mod tests {
let r = sse3::_mm_loaddup_pd(&d);
assert_eq!(r, f64x2::new(d, d));
}
}
}

View file

@ -15,7 +15,7 @@ pub unsafe fn _mm_blendv_epi8(a: i8x16, b: i8x16, mask: i8x16) -> i8x16 {
#[inline(always)]
#[target_feature = "+sse4.1"]
#[cfg_attr(test, assert_instr(pblendw, imm8=0xF0))]
#[cfg_attr(test, assert_instr(pblendw, imm8 = 0xF0))]
pub unsafe fn _mm_blend_epi16(a: i16x8, b: i16x8, imm8: u8) -> i16x8 {
macro_rules! call {
($imm8:expr) => { pblendw(a, b, $imm8) }
@ -23,7 +23,8 @@ pub unsafe fn _mm_blend_epi16(a: i16x8, b: i16x8, imm8: u8) -> i16x8 {
constify_imm8!(imm8, call)
}
/// Blend packed double-precision (64-bit) floating-point elements from `a` and `b` using `mask`
/// Blend packed double-precision (64-bit) floating-point elements from `a`
/// and `b` using `mask`
#[inline(always)]
#[target_feature = "+sse4.1"]
#[cfg_attr(test, assert_instr(blendvpd))]
@ -31,7 +32,8 @@ pub unsafe fn _mm_blendv_pd(a: f64x2, b: f64x2, mask: f64x2) -> f64x2 {
blendvpd(a, b, mask)
}
/// Blend packed single-precision (32-bit) floating-point elements from `a` and `b` using `mask`
/// Blend packed single-precision (32-bit) floating-point elements from `a`
/// and `b` using `mask`
#[inline(always)]
#[target_feature = "+sse4.1"]
#[cfg_attr(test, assert_instr(blendvps))]
@ -39,10 +41,11 @@ pub unsafe fn _mm_blendv_ps(a: f32x4, b: f32x4, mask: f32x4) -> f32x4 {
blendvps(a, b, mask)
}
/// Blend packed double-precision (64-bit) floating-point elements from `a` and `b` using control mask `imm2`
/// Blend packed double-precision (64-bit) floating-point elements from `a`
/// and `b` using control mask `imm2`
#[inline(always)]
#[target_feature = "+sse4.1"]
#[cfg_attr(test, assert_instr(blendpd, imm2=0b10))]
#[cfg_attr(test, assert_instr(blendpd, imm2 = 0b10))]
pub unsafe fn _mm_blend_pd(a: f64x2, b: f64x2, imm2: u8) -> f64x2 {
macro_rules! call {
($imm2:expr) => { blendpd(a, b, $imm2) }
@ -50,10 +53,11 @@ pub unsafe fn _mm_blend_pd(a: f64x2, b: f64x2, imm2: u8) -> f64x2 {
constify_imm2!(imm2, call)
}
/// Blend packed single-precision (32-bit) floating-point elements from `a` and `b` using mask `imm4`
/// Blend packed single-precision (32-bit) floating-point elements from `a`
/// and `b` using mask `imm4`
#[inline(always)]
#[target_feature = "+sse4.1"]
#[cfg_attr(test, assert_instr(blendps, imm4=0b0101))]
#[cfg_attr(test, assert_instr(blendps, imm4 = 0b0101))]
pub unsafe fn _mm_blend_ps(a: f32x4, b: f32x4, imm4: u8) -> f32x4 {
macro_rules! call {
($imm4:expr) => { blendps(a, b, $imm4) }
@ -61,11 +65,12 @@ pub unsafe fn _mm_blend_ps(a: f32x4, b: f32x4, imm4: u8) -> f32x4 {
constify_imm4!(imm4, call)
}
/// Extract a single-precision (32-bit) floating-point element from `a`, selected with `imm8`
/// Extract a single-precision (32-bit) floating-point element from `a`,
/// selected with `imm8`
#[inline(always)]
#[target_feature = "+sse4.1"]
// TODO: Add test for Windows
#[cfg_attr(all(test, not(windows)), assert_instr(extractps, imm8=0))]
#[cfg_attr(all(test, not(windows)), assert_instr(extractps, imm8 = 0))]
pub unsafe fn _mm_extract_ps(a: f32x4, imm8: u8) -> i32 {
mem::transmute(a.extract(imm8 as u32 & 0b11))
}
@ -73,7 +78,7 @@ pub unsafe fn _mm_extract_ps(a: f32x4, imm8: u8) -> i32 {
/// Extract an 8-bit integer from `a` selected with `imm8`
#[inline(always)]
#[target_feature = "+sse4.1"]
#[cfg_attr(test, assert_instr(pextrb, imm8=0))]
#[cfg_attr(test, assert_instr(pextrb, imm8 = 0))]
pub unsafe fn _mm_extract_epi8(a: i8x16, imm8: u8) -> i8 {
a.extract((imm8 & 0b1111) as u32)
}
@ -82,7 +87,7 @@ pub unsafe fn _mm_extract_epi8(a: i8x16, imm8: u8) -> i8 {
#[inline(always)]
#[target_feature = "+sse4.1"]
// TODO: Add test for Windows
#[cfg_attr(all(test, not(windows)), assert_instr(pextrd, imm8=1))]
#[cfg_attr(all(test, not(windows)), assert_instr(pextrd, imm8 = 1))]
pub unsafe fn _mm_extract_epi32(a: i32x4, imm8: u8) -> i32 {
a.extract((imm8 & 0b11) as u32)
}
@ -92,15 +97,16 @@ pub unsafe fn _mm_extract_epi32(a: i32x4, imm8: u8) -> i32 {
#[inline(always)]
#[target_feature = "+sse4.1"]
// TODO: Add test for Windows
#[cfg_attr(all(test, not(windows)), assert_instr(pextrq, imm8=1))]
#[cfg_attr(all(test, not(windows)), assert_instr(pextrq, imm8 = 1))]
pub unsafe fn _mm_extract_epi64(a: i64x2, imm8: u8) -> i64 {
a.extract((imm8 & 0b1) as u32)
}
/// Select a single value in `a` to store at some position in `b`,
/// Select a single value in `a` to store at some position in `b`,
/// Then zero elements according to `imm8`.
///
/// `imm8` specifies which bits from operand `a` will be copied, which bits in the
///
/// `imm8` specifies which bits from operand `a` will be copied, which bits in
/// the
/// result they will be copied to, and which bits in the result will be
/// cleared. The following assignments are made:
///
@ -121,7 +127,7 @@ pub unsafe fn _mm_extract_epi64(a: i64x2, imm8: u8) -> i64 {
/// element is cleared.
#[inline(always)]
#[target_feature = "+sse4.1"]
#[cfg_attr(test, assert_instr(insertps, imm8=0b1010))]
#[cfg_attr(test, assert_instr(insertps, imm8 = 0b1010))]
pub unsafe fn _mm_insert_ps(a: f32x4, b: f32x4, imm8: u8) -> f32x4 {
macro_rules! call {
($imm8:expr) => { insertps(a, b, $imm8) }
@ -129,59 +135,66 @@ pub unsafe fn _mm_insert_ps(a: f32x4, b: f32x4, imm8: u8) -> f32x4 {
constify_imm8!(imm8, call)
}
/// Return a copy of `a` with the 8-bit integer from `i` inserted at a location specified by `imm8`.
/// Return a copy of `a` with the 8-bit integer from `i` inserted at a
/// location specified by `imm8`.
#[inline(always)]
#[target_feature = "+sse4.1"]
#[cfg_attr(test, assert_instr(pinsrb, imm8=0))]
#[cfg_attr(test, assert_instr(pinsrb, imm8 = 0))]
pub unsafe fn _mm_insert_epi8(a: i8x16, i: i8, imm8: u8) -> i8x16 {
a.replace((imm8 & 0b1111) as u32, i)
}
/// Return a copy of `a` with the 32-bit integer from `i` inserted at a location specified by `imm8`.
/// Return a copy of `a` with the 32-bit integer from `i` inserted at a
/// location specified by `imm8`.
#[inline(always)]
#[target_feature = "+sse4.1"]
#[cfg_attr(test, assert_instr(pinsrd, imm8=0))]
#[cfg_attr(test, assert_instr(pinsrd, imm8 = 0))]
pub unsafe fn _mm_insert_epi32(a: i32x4, i: i32, imm8: u8) -> i32x4 {
a.replace((imm8 & 0b11) as u32, i)
}
/// Return a copy of `a` with the 64-bit integer from `i` inserted at a location specified by `imm8`.
/// Return a copy of `a` with the 64-bit integer from `i` inserted at a
/// location specified by `imm8`.
#[cfg(target_arch = "x86_64")]
#[inline(always)]
#[target_feature = "+sse4.1"]
#[cfg_attr(test, assert_instr(pinsrq, imm8=0))]
#[cfg_attr(test, assert_instr(pinsrq, imm8 = 0))]
pub unsafe fn _mm_insert_epi64(a: i64x2, i: i64, imm8: u8) -> i64x2 {
a.replace((imm8 & 0b1) as u32, i)
}
/// Compare packed 8-bit integers in `a` and `b`,87 and return packed maximum values in dst.
/// Compare packed 8-bit integers in `a` and `b`,87 and return packed maximum
/// values in dst.
#[inline(always)]
#[target_feature = "+sse4.1"]
#[cfg_attr(test, assert_instr(pmaxsb, imm8=0))]
#[cfg_attr(test, assert_instr(pmaxsb, imm8 = 0))]
pub unsafe fn _mm_max_epi8(a: i8x16, b: i8x16) -> i8x16 {
pmaxsb(a, b)
}
/// Compare packed unsigned 16-bit integers in `a` and `b`, and return packed maximum.
/// Compare packed unsigned 16-bit integers in `a` and `b`, and return packed
/// maximum.
#[inline(always)]
#[target_feature = "+sse4.1"]
#[cfg_attr(test, assert_instr(pmaxuw, imm8=0))]
#[cfg_attr(test, assert_instr(pmaxuw, imm8 = 0))]
pub unsafe fn _mm_max_epu16(a: u16x8, b: u16x8) -> u16x8 {
pmaxuw(a, b)
}
// Compare packed 32-bit integers in `a` and `b`, and return packed maximum values.
// Compare packed 32-bit integers in `a` and `b`, and return packed maximum
// values.
#[inline(always)]
#[target_feature = "+sse4.1"]
#[cfg_attr(test, assert_instr(pmaxsd, imm8=0))]
#[cfg_attr(test, assert_instr(pmaxsd, imm8 = 0))]
pub unsafe fn _mm_max_epi32(a: i32x4, b: i32x4) -> i32x4 {
pmaxsd(a, b)
}
// Compare packed unsigned 32-bit integers in `a` and `b`, and return packed maximum values.
// Compare packed unsigned 32-bit integers in `a` and `b`, and return packed
// maximum values.
#[inline(always)]
#[target_feature = "+sse4.1"]
#[cfg_attr(test, assert_instr(pmaxud, imm8=0))]
#[cfg_attr(test, assert_instr(pmaxud, imm8 = 0))]
pub unsafe fn _mm_max_epu32(a: u32x4, b: u32x4) -> u32x4 {
pmaxud(a, b)
}
@ -221,7 +234,7 @@ pub unsafe fn _mm_dp_ps(a: f32x4, b: f32x4, imm8: u8) -> f32x4 {
}
#[allow(improper_ctypes)]
extern {
extern "C" {
#[link_name = "llvm.x86.sse41.pblendvb"]
fn pblendvb(a: i8x16, b: i8x16, mask: i8x16) -> i8x16;
#[link_name = "llvm.x86.sse41.blendvpd"]
@ -261,14 +274,18 @@ mod tests {
#[simd_test = "sse4.1"]
unsafe fn _mm_blendv_epi8() {
let a = i8x16::new(
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
let a =
i8x16::new(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
#[cfg_attr(rustfmt, rustfmt_skip)]
let b = i8x16::new(
16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31);
let mask = i8x16::new(
0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1);
16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
);
let mask =
i8x16::new(0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1);
#[cfg_attr(rustfmt, rustfmt_skip)]
let e = i8x16::new(
0, 17, 2, 19, 4, 21, 6, 23, 8, 25, 10, 27, 12, 29, 14, 31);
0, 17, 2, 19, 4, 21, 6, 23, 8, 25, 10, 27, 12, 29, 14, 31,
);
assert_eq!(sse41::_mm_blendv_epi8(a, b, mask), e);
}
@ -286,7 +303,7 @@ mod tests {
unsafe fn _mm_blendv_ps() {
let a = f32x4::splat(0.0);
let b = f32x4::splat(1.0);
let mask = mem::transmute(i32x4::new(0,-1, 0, -1));
let mask = mem::transmute(i32x4::new(0, -1, 0, -1));
let r = sse41::_mm_blendv_ps(a, b, mask);
let e = f32x4::new(0.0, 1.0, 0.0, 1.0);
assert_eq!(r, e);
@ -330,7 +347,8 @@ mod tests {
#[simd_test = "sse4.1"]
unsafe fn _mm_extract_epi8() {
let a = i8x16::new(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
let a =
i8x16::new(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
let r = sse41::_mm_extract_epi8(a, 1);
assert_eq!(r, 1);
let r = sse41::_mm_extract_epi8(a, 17);
@ -398,10 +416,22 @@ mod tests {
#[simd_test = "sse4.1"]
unsafe fn _mm_max_epi8() {
let a = i8x16::new(1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29, 32);
let b = i8x16::new(2, 3, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31);
#[cfg_attr(rustfmt, rustfmt_skip)]
let a = i8x16::new(
1, 4, 5, 8, 9, 12, 13, 16,
17, 20, 21, 24, 25, 28, 29, 32,
);
#[cfg_attr(rustfmt, rustfmt_skip)]
let b = i8x16::new(
2, 3, 6, 7, 10, 11, 14, 15,
18, 19, 22, 23, 26, 27, 30, 31,
);
let r = sse41::_mm_max_epi8(a, b);
let e = i8x16::new(2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 32);
#[cfg_attr(rustfmt, rustfmt_skip)]
let e = i8x16::new(
2, 4, 6, 8, 10, 12, 14, 16,
18, 20, 22, 24, 26, 28, 30, 32,
);
assert_eq!(r, e);
}

View file

@ -15,7 +15,8 @@ pub const _SIDD_SWORD_OPS: i8 = 0b00000011;
/// For each character in `a`, find if it is in `b` *(Default)*
pub const _SIDD_CMP_EQUAL_ANY: i8 = 0b00000000;
/// For each character in `a`, determine if `b[0] <= c <= b[1] or b[1] <= c <= b[2]...`
/// For each character in `a`, determine if `b[0] <= c <= b[1] or b[1] <= c <=
/// b[2]...`
pub const _SIDD_CMP_RANGES: i8 = 0b00000100;
/// The strings defined by `a` and `b` are equal
pub const _SIDD_CMP_EQUAL_EACH: i8 = 0b00001000;
@ -46,11 +47,7 @@ pub const _SIDD_UNIT_MASK: i8 = 0b01000000;
#[inline(always)]
#[target_feature = "+sse4.2"]
#[cfg_attr(test, assert_instr(pcmpistrm, imm8 = 0))]
pub unsafe fn _mm_cmpistrm(
a: __m128i,
b: __m128i,
imm8: i8,
) -> u8x16 {
pub unsafe fn _mm_cmpistrm(a: __m128i, b: __m128i, imm8: i8) -> u8x16 {
macro_rules! call {
($imm8:expr) => { pcmpistrm128(a, b, $imm8) }
}
@ -58,9 +55,9 @@ pub unsafe fn _mm_cmpistrm(
}
/// Compare packed strings with implicit lengths in `a` and `b` using the
/// control in `imm8`, and return the generated index. Similar to [`_mm_cmpestri`]
/// with the excception that [`_mm_cmpestri`] requires the lengths of `a` and
/// `b` to be explicitly specified.
/// control in `imm8`, and return the generated index. Similar to
/// [`_mm_cmpestri`] with the excception that [`_mm_cmpestri`] requires the
/// lengths of `a` and `b` to be explicitly specified.
///
/// # Control modes
///
@ -105,7 +102,8 @@ pub unsafe fn _mm_cmpistrm(
/// use stdsimd::simd::u8x16;
/// use stdsimd::vendor::{__m128i, _mm_cmpistri, _SIDD_CMP_EQUAL_ORDERED};
///
/// let haystack = b"This is a long string of text data\r\n\tthat extends multiple lines";
/// let haystack = b"This is a long string of text data\r\n\tthat extends
/// multiple lines";
/// let needle = b"\r\n\t\0\0\0\0\0\0\0\0\0\0\0\0\0";
///
/// let a = __m128i::from(u8x16::load(needle, 0));
@ -171,8 +169,8 @@ pub unsafe fn _mm_cmpistrm(
/// # }
/// ```
///
/// Find the index of the first character in the haystack that is within a range
/// of characters.
/// Find the index of the first character in the haystack that is within a
/// range of characters.
///
/// ```
/// # #![feature(cfg_target_feature)]
@ -269,11 +267,7 @@ pub unsafe fn _mm_cmpistrm(
#[inline(always)]
#[target_feature = "+sse4.2"]
#[cfg_attr(test, assert_instr(pcmpistri, imm8 = 0))]
pub unsafe fn _mm_cmpistri(
a: __m128i,
b: __m128i,
imm8: i8,
) -> i32 {
pub unsafe fn _mm_cmpistri(a: __m128i, b: __m128i, imm8: i8) -> i32 {
macro_rules! call {
($imm8:expr) => { pcmpistri128(a, b, $imm8) }
}
@ -286,11 +280,7 @@ pub unsafe fn _mm_cmpistri(
#[inline(always)]
#[target_feature = "+sse4.2"]
#[cfg_attr(test, assert_instr(pcmpistri, imm8 = 0))]
pub unsafe fn _mm_cmpistrz(
a: __m128i,
b: __m128i,
imm8: i8,
) -> i32 {
pub unsafe fn _mm_cmpistrz(a: __m128i, b: __m128i, imm8: i8) -> i32 {
macro_rules! call {
($imm8:expr) => { pcmpistriz128(a, b, $imm8) }
}
@ -303,11 +293,7 @@ pub unsafe fn _mm_cmpistrz(
#[inline(always)]
#[target_feature = "+sse4.2"]
#[cfg_attr(test, assert_instr(pcmpistri, imm8 = 0))]
pub unsafe fn _mm_cmpistrc(
a: __m128i,
b: __m128i,
imm8: i8,
) -> i32 {
pub unsafe fn _mm_cmpistrc(a: __m128i, b: __m128i, imm8: i8) -> i32 {
macro_rules! call {
($imm8:expr) => { pcmpistric128(a, b, $imm8) }
}
@ -320,11 +306,7 @@ pub unsafe fn _mm_cmpistrc(
#[inline(always)]
#[target_feature = "+sse4.2"]
#[cfg_attr(test, assert_instr(pcmpistri, imm8 = 0))]
pub unsafe fn _mm_cmpistrs(
a: __m128i,
b: __m128i,
imm8: i8,
) -> i32 {
pub unsafe fn _mm_cmpistrs(a: __m128i, b: __m128i, imm8: i8) -> i32 {
macro_rules! call {
($imm8:expr) => { pcmpistris128(a, b, $imm8) }
}
@ -336,11 +318,7 @@ pub unsafe fn _mm_cmpistrs(
#[inline(always)]
#[target_feature = "+sse4.2"]
#[cfg_attr(test, assert_instr(pcmpistri, imm8 = 0))]
pub unsafe fn _mm_cmpistro(
a: __m128i,
b: __m128i,
imm8: i8,
) -> i32 {
pub unsafe fn _mm_cmpistro(a: __m128i, b: __m128i, imm8: i8) -> i32 {
macro_rules! call {
($imm8:expr) => { pcmpistrio128(a, b, $imm8) }
}
@ -353,11 +331,7 @@ pub unsafe fn _mm_cmpistro(
#[inline(always)]
#[target_feature = "+sse4.2"]
#[cfg_attr(test, assert_instr(pcmpistri, imm8 = 0))]
pub unsafe fn _mm_cmpistra(
a: __m128i,
b: __m128i,
imm8: i8,
) -> i32 {
pub unsafe fn _mm_cmpistra(a: __m128i, b: __m128i, imm8: i8) -> i32 {
macro_rules! call {
($imm8:expr) => { pcmpistria128(a, b, $imm8) }
}
@ -370,11 +344,7 @@ pub unsafe fn _mm_cmpistra(
#[target_feature = "+sse4.2"]
#[cfg_attr(test, assert_instr(pcmpestrm, imm8 = 0))]
pub unsafe fn _mm_cmpestrm(
a: __m128i,
la: i32,
b: __m128i,
lb: i32,
imm8: i8,
a: __m128i, la: i32, b: __m128i, lb: i32, imm8: i8
) -> u8x16 {
macro_rules! call {
($imm8:expr) => { pcmpestrm128(a, la, b, lb, $imm8) }
@ -383,9 +353,9 @@ pub unsafe fn _mm_cmpestrm(
}
/// Compare packed strings `a` and `b` with lengths `la` and `lb` using the
/// control in `imm8`, and return the generated index. Similar to [`_mm_cmpistri`]
/// with the excception that [`_mm_cmpistri`] implicityly determines the length of
/// `a` and `b`.
/// control in `imm8`, and return the generated index. Similar to
/// [`_mm_cmpistri`] with the exception that [`_mm_cmpistri`] implicitly
/// determines the length of `a` and `b`.
///
/// # Control modes
///
@ -468,11 +438,7 @@ pub unsafe fn _mm_cmpestrm(
#[target_feature = "+sse4.2"]
#[cfg_attr(test, assert_instr(pcmpestri, imm8 = 0))]
pub unsafe fn _mm_cmpestri(
a: __m128i,
la: i32,
b: __m128i,
lb: i32,
imm8: i8,
a: __m128i, la: i32, b: __m128i, lb: i32, imm8: i8
) -> i32 {
macro_rules! call {
($imm8:expr) => { pcmpestri128(a, la, b, lb, $imm8) }
@ -487,11 +453,7 @@ pub unsafe fn _mm_cmpestri(
#[target_feature = "+sse4.2"]
#[cfg_attr(test, assert_instr(pcmpestri, imm8 = 0))]
pub unsafe fn _mm_cmpestrz(
a: __m128i,
la: i32,
b: __m128i,
lb: i32,
imm8: i8,
a: __m128i, la: i32, b: __m128i, lb: i32, imm8: i8
) -> i32 {
macro_rules! call {
($imm8:expr) => { pcmpestriz128(a, la, b, lb, $imm8) }
@ -506,11 +468,7 @@ pub unsafe fn _mm_cmpestrz(
#[target_feature = "+sse4.2"]
#[cfg_attr(test, assert_instr(pcmpestri, imm8 = 0))]
pub unsafe fn _mm_cmpestrc(
a: __m128i,
la: i32,
b: __m128i,
lb: i32,
imm8: i8,
a: __m128i, la: i32, b: __m128i, lb: i32, imm8: i8
) -> i32 {
macro_rules! call {
($imm8:expr) => { pcmpestric128(a, la, b, lb, $imm8) }
@ -525,11 +483,7 @@ pub unsafe fn _mm_cmpestrc(
#[target_feature = "+sse4.2"]
#[cfg_attr(test, assert_instr(pcmpestri, imm8 = 0))]
pub unsafe fn _mm_cmpestrs(
a: __m128i,
la: i32,
b: __m128i,
lb: i32,
imm8: i8,
a: __m128i, la: i32, b: __m128i, lb: i32, imm8: i8
) -> i32 {
macro_rules! call {
($imm8:expr) => { pcmpestris128(a, la, b, lb, $imm8) }
@ -544,11 +498,7 @@ pub unsafe fn _mm_cmpestrs(
#[target_feature = "+sse4.2"]
#[cfg_attr(test, assert_instr(pcmpestri, imm8 = 0))]
pub unsafe fn _mm_cmpestro(
a: __m128i,
la: i32,
b: __m128i,
lb: i32,
imm8: i8,
a: __m128i, la: i32, b: __m128i, lb: i32, imm8: i8
) -> i32 {
macro_rules! call {
($imm8:expr) => { pcmpestrio128(a, la, b, lb, $imm8) }
@ -564,11 +514,7 @@ pub unsafe fn _mm_cmpestro(
#[target_feature = "+sse4.2"]
#[cfg_attr(test, assert_instr(pcmpestri, imm8 = 0))]
pub unsafe fn _mm_cmpestra(
a: __m128i,
la: i32,
b: __m128i,
lb: i32,
imm8: i8,
a: __m128i, la: i32, b: __m128i, lb: i32, imm8: i8
) -> i32 {
macro_rules! call {
($imm8:expr) => { pcmpestria128(a, la, b, lb, $imm8) }
@ -624,22 +570,35 @@ pub unsafe fn _mm_cmpgt_epi64(a: i64x2, b: i64x2) -> i64x2 {
}
#[allow(improper_ctypes)]
extern {
extern "C" {
// SSE 4.2 string and text comparison ops
#[link_name = "llvm.x86.sse42.pcmpestrm128"]
fn pcmpestrm128(a: __m128i, la: i32, b: __m128i, lb: i32, imm8: i8) -> u8x16;
fn pcmpestrm128(
a: __m128i, la: i32, b: __m128i, lb: i32, imm8: i8
) -> u8x16;
#[link_name = "llvm.x86.sse42.pcmpestri128"]
fn pcmpestri128(a: __m128i, la: i32, b: __m128i, lb: i32, imm8: i8) -> i32;
fn pcmpestri128(a: __m128i, la: i32, b: __m128i, lb: i32, imm8: i8)
-> i32;
#[link_name = "llvm.x86.sse42.pcmpestriz128"]
fn pcmpestriz128(a: __m128i, la: i32, b: __m128i, lb: i32, imm8: i8) -> i32;
fn pcmpestriz128(
a: __m128i, la: i32, b: __m128i, lb: i32, imm8: i8
) -> i32;
#[link_name = "llvm.x86.sse42.pcmpestric128"]
fn pcmpestric128(a: __m128i, la: i32, b: __m128i, lb: i32, imm8: i8) -> i32;
fn pcmpestric128(
a: __m128i, la: i32, b: __m128i, lb: i32, imm8: i8
) -> i32;
#[link_name = "llvm.x86.sse42.pcmpestris128"]
fn pcmpestris128(a: __m128i, la: i32, b: __m128i, lb: i32, imm8: i8) -> i32;
fn pcmpestris128(
a: __m128i, la: i32, b: __m128i, lb: i32, imm8: i8
) -> i32;
#[link_name = "llvm.x86.sse42.pcmpestrio128"]
fn pcmpestrio128(a: __m128i, la: i32, b: __m128i, lb: i32, imm8: i8) -> i32;
fn pcmpestrio128(
a: __m128i, la: i32, b: __m128i, lb: i32, imm8: i8
) -> i32;
#[link_name = "llvm.x86.sse42.pcmpestria128"]
fn pcmpestria128(a: __m128i, la: i32, b: __m128i, lb: i32, imm8: i8) -> i32;
fn pcmpestria128(
a: __m128i, la: i32, b: __m128i, lb: i32, imm8: i8
) -> i32;
#[link_name = "llvm.x86.sse42.pcmpistrm128"]
fn pcmpistrm128(a: __m128i, b: __m128i, imm8: i8) -> u8x16;
#[link_name = "llvm.x86.sse42.pcmpistri128"]
@ -685,7 +644,8 @@ mod tests {
ptr::copy_nonoverlapping(
s.get_unchecked(0) as *const u8 as *const u8,
slice.get_unchecked_mut(0) as *mut u8 as *mut u8,
s.len());
s.len(),
);
__m128i::from(u8x16::load(slice, 0))
}
@ -694,8 +654,11 @@ mod tests {
let a = str_to_m128i(b"Hello! Good-Bye!");
let b = str_to_m128i(b"hello! good-bye!");
let i = sse42::_mm_cmpistrm(a, b, sse42::_SIDD_UNIT_MASK);
let res = u8x16::new(0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00,
0xff, 0xff, 0xff, 0xff, 0x00, 0xff, 0xff, 0xff);
#[cfg_attr(rustfmt, rustfmt_skip)]
let res = u8x16::new(
0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00,
0xff, 0xff, 0xff, 0xff, 0x00, 0xff, 0xff, 0xff,
);
assert_eq!(i, res);
}
@ -733,14 +696,23 @@ mod tests {
#[simd_test = "sse4.2"]
unsafe fn _mm_cmpistro() {
let a_bytes = u8x16::new(0x00, 0x47, 0x00, 0x65, 0x00, 0x6c, 0x00, 0x6c,
0x00, 0x6f, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00);
let b_bytes = u8x16::new(0x00, 0x48, 0x00, 0x65, 0x00, 0x6c, 0x00, 0x6c,
0x00, 0x6f, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00);
#[cfg_attr(rustfmt, rustfmt_skip)]
let a_bytes = u8x16::new(
0x00, 0x47, 0x00, 0x65, 0x00, 0x6c, 0x00, 0x6c,
0x00, 0x6f, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
);
#[cfg_attr(rustfmt, rustfmt_skip)]
let b_bytes = u8x16::new(
0x00, 0x48, 0x00, 0x65, 0x00, 0x6c, 0x00, 0x6c,
0x00, 0x6f, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
);
let a = __m128i::from(a_bytes);
let b = __m128i::from(b_bytes);
let i = sse42::_mm_cmpistro(
a, b, sse42::_SIDD_UWORD_OPS | sse42::_SIDD_UNIT_MASK);
a,
b,
sse42::_SIDD_UWORD_OPS | sse42::_SIDD_UNIT_MASK,
);
assert_eq!(0, i);
}
@ -757,15 +729,20 @@ mod tests {
let a = str_to_m128i(b"Hello!");
let b = str_to_m128i(b"Hello.");
let i = sse42::_mm_cmpestrm(a, 5, b, 5, sse42::_SIDD_UNIT_MASK);
assert_eq!(i, u8x16::new(0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00));
#[cfg_attr(rustfmt, rustfmt_skip)]
let r = u8x16::new(
0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
);
assert_eq!(i, r);
}
#[simd_test = "sse4.2"]
unsafe fn _mm_cmpestri() {
let a = str_to_m128i(b"bar - garbage");
let b = str_to_m128i(b"foobar");
let i = sse42::_mm_cmpestri(a, 3, b, 6, sse42::_SIDD_CMP_EQUAL_ORDERED);
let i =
sse42::_mm_cmpestri(a, 3, b, 6, sse42::_SIDD_CMP_EQUAL_ORDERED);
assert_eq!(3, i);
}
@ -773,8 +750,8 @@ mod tests {
unsafe fn _mm_cmpestrz() {
let a = str_to_m128i(b"");
let b = str_to_m128i(b"Hello");
let i = sse42::_mm_cmpestrz(
a, 16, b, 6, sse42::_SIDD_CMP_EQUAL_ORDERED);
let i =
sse42::_mm_cmpestrz(a, 16, b, 6, sse42::_SIDD_CMP_EQUAL_ORDERED);
assert_eq!(1, i);
}
@ -782,19 +759,20 @@ mod tests {
unsafe fn _mm_cmpestrc() {
let va = str_to_m128i(b"!!!!!!!!");
let vb = str_to_m128i(b" ");
let i = sse42::_mm_cmpestrc(
va, 7, vb, 7, sse42::_SIDD_UNIT_MASK);
let i = sse42::_mm_cmpestrc(va, 7, vb, 7, sse42::_SIDD_UNIT_MASK);
assert_eq!(0, i);
}
#[simd_test = "sse4.2"]
unsafe fn _mm_cmpestrs() {
let a_bytes = u8x16::new(0x00, 0x48, 0x00, 0x65, 0x00, 0x6c, 0x00, 0x6c,
0x00, 0x6f, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00);
#[cfg_attr(rustfmt, rustfmt_skip)]
let a_bytes = u8x16::new(
0x00, 0x48, 0x00, 0x65, 0x00, 0x6c, 0x00, 0x6c,
0x00, 0x6f, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
);
let a = __m128i::from(a_bytes);
let b = __m128i::from(u8x16::splat(0x00));
let i = sse42::_mm_cmpestrs(
a, 8, b, 0, sse42::_SIDD_UWORD_OPS);
let i = sse42::_mm_cmpestrs(a, 8, b, 0, sse42::_SIDD_UWORD_OPS);
assert_eq!(0, i);
}
@ -802,8 +780,7 @@ mod tests {
unsafe fn _mm_cmpestro() {
let a = str_to_m128i(b"Hello");
let b = str_to_m128i(b"World");
let i = sse42::_mm_cmpestro(
a, 5, b, 5, sse42::_SIDD_UBYTE_OPS);
let i = sse42::_mm_cmpestro(a, 5, b, 5, sse42::_SIDD_UBYTE_OPS);
assert_eq!(0, i);
}
@ -812,7 +789,12 @@ mod tests {
let a = str_to_m128i(b"Cannot match a");
let b = str_to_m128i(b"Null after 14");
let i = sse42::_mm_cmpestra(
a, 14, b, 16, sse42::_SIDD_CMP_EQUAL_EACH | sse42::_SIDD_UNIT_MASK);
a,
14,
b,
16,
sse42::_SIDD_CMP_EQUAL_EACH | sse42::_SIDD_UNIT_MASK,
);
assert_eq!(1, i);
}

View file

@ -13,7 +13,8 @@ pub unsafe fn _mm_abs_epi8(a: i8x16) -> u8x16 {
pabsb128(a)
}
/// Compute the absolute value of each of the packed 16-bit signed integers in `a` and
/// Compute the absolute value of each of the packed 16-bit signed integers in
/// `a` and
/// return the 16-bit unsigned integer
#[inline(always)]
#[target_feature = "+ssse3"]
@ -22,7 +23,8 @@ pub unsafe fn _mm_abs_epi16(a: i16x8) -> u16x8 {
pabsw128(a)
}
/// Compute the absolute value of each of the packed 32-bit signed integers in `a` and
/// Compute the absolute value of each of the packed 32-bit signed integers in
/// `a` and
/// return the 32-bit unsigned integer
#[inline(always)]
#[target_feature = "+ssse3"]
@ -82,7 +84,9 @@ pub unsafe fn _mm_alignr_epi8(a: i8x16, b: i8x16, n: i32) -> i8x16 {
(a, b, n)
};
const fn add(a: u32, b: u32) -> u32 { a + b }
const fn add(a: u32, b: u32) -> u32 {
a + b
}
macro_rules! shuffle {
($shift:expr) => {
simd_shuffle16(b, a, [
@ -98,14 +102,22 @@ pub unsafe fn _mm_alignr_epi8(a: i8x16, b: i8x16, n: i32) -> i8x16 {
}
}
match n {
0 => shuffle!(0), 1 => shuffle!(1),
2 => shuffle!(2), 3 => shuffle!(3),
4 => shuffle!(4), 5 => shuffle!(5),
6 => shuffle!(6), 7 => shuffle!(7),
8 => shuffle!(8), 9 => shuffle!(9),
10 => shuffle!(10), 11 => shuffle!(11),
12 => shuffle!(12), 13 => shuffle!(13),
14 => shuffle!(14), 15 => shuffle!(15),
0 => shuffle!(0),
1 => shuffle!(1),
2 => shuffle!(2),
3 => shuffle!(3),
4 => shuffle!(4),
5 => shuffle!(5),
6 => shuffle!(6),
7 => shuffle!(7),
8 => shuffle!(8),
9 => shuffle!(9),
10 => shuffle!(10),
11 => shuffle!(11),
12 => shuffle!(12),
13 => shuffle!(13),
14 => shuffle!(14),
15 => shuffle!(15),
_ => shuffle!(16),
}
}
@ -223,7 +235,7 @@ pub unsafe fn _mm_sign_epi32(a: i32x4, b: i32x4) -> i32x4 {
}
#[allow(improper_ctypes)]
extern {
extern "C" {
#[link_name = "llvm.x86.ssse3.pabs.b.128"]
fn pabsb128(a: i8x16) -> u8x16;
@ -275,7 +287,7 @@ mod tests {
use stdsimd_test::simd_test;
use v128::*;
use x86::ssse3 as ssse3;
use x86::ssse3;
#[simd_test = "ssse3"]
unsafe fn _mm_abs_epi8() {
@ -297,44 +309,36 @@ mod tests {
#[simd_test = "ssse3"]
unsafe fn _mm_shuffle_epi8() {
let a = u8x16::new(
1, 2, 3, 4,
5, 6, 7, 8,
9, 10, 11, 12,
13, 14, 15, 16,
);
let b = u8x16::new(
4, 128, 4, 3,
24, 12, 6, 19,
12, 5, 5, 10,
4, 1, 8, 0,
);
let expected = u8x16::new(
5, 0, 5, 4,
9, 13, 7, 4,
13, 6, 6, 11,
5, 2, 9, 1,
);
let a =
u8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
let b =
u8x16::new(4, 128, 4, 3, 24, 12, 6, 19, 12, 5, 5, 10, 4, 1, 8, 0);
let expected =
u8x16::new(5, 0, 5, 4, 9, 13, 7, 4, 13, 6, 6, 11, 5, 2, 9, 1);
let r = ssse3::_mm_shuffle_epi8(a, b);
assert_eq!(r, expected);
}
#[simd_test = "ssse3"]
unsafe fn _mm_alignr_epi8() {
let a = i8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
let b = i8x16::new(4, 63, 4, 3, 24, 12, 6, 19, 12, 5, 5, 10, 4, 1, 8, 0);
let a =
i8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
let b =
i8x16::new(4, 63, 4, 3, 24, 12, 6, 19, 12, 5, 5, 10, 4, 1, 8, 0);
let r = ssse3::_mm_alignr_epi8(a, b, 33);
assert_eq!(r, i8x16::splat(0));
let r = ssse3::_mm_alignr_epi8(a, b, 17);
let expected = i8x16::new(2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 0);
let expected =
i8x16::new(2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 0);
assert_eq!(r, expected);
let r = ssse3::_mm_alignr_epi8(a, b, 16);
assert_eq!(r, a);
let r = ssse3::_mm_alignr_epi8(a, b, 15);
let expected = i8x16::new(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
let expected =
i8x16::new(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
assert_eq!(r, expected);
let r = ssse3::_mm_alignr_epi8(a, b, 0);
@ -397,8 +401,10 @@ mod tests {
#[simd_test = "ssse3"]
unsafe fn _mm_maddubs_epi16() {
let a = u8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
let b = i8x16::new(4, 63, 4, 3, 24, 12, 6, 19, 12, 5, 5, 10, 4, 1, 8, 0);
let a =
u8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
let b =
i8x16::new(4, 63, 4, 3, 24, 12, 6, 19, 12, 5, 5, 10, 4, 1, 8, 0);
let expected = i16x8::new(130, 24, 192, 194, 158, 175, 66, 120);
let r = ssse3::_mm_maddubs_epi16(a, b);
assert_eq!(r, expected);
@ -415,9 +421,21 @@ mod tests {
#[simd_test = "ssse3"]
unsafe fn _mm_sign_epi8() {
let a = i8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, -14, -15, 16);
let b = i8x16::new(4, 63, -4, 3, 24, 12, -6, -19, 12, 5, -5, 10, 4, 1, -8, 0);
let expected = i8x16::new(1, 2, -3, 4, 5, 6, -7, -8, 9, 10, -11, 12, 13, -14, 15, 0);
#[cfg_attr(rustfmt, rustfmt_skip)]
let a = i8x16::new(
1, 2, 3, 4, 5, 6, 7, 8,
9, 10, 11, 12, 13, -14, -15, 16,
);
#[cfg_attr(rustfmt, rustfmt_skip)]
let b = i8x16::new(
4, 63, -4, 3, 24, 12, -6, -19,
12, 5, -5, 10, 4, 1, -8, 0,
);
#[cfg_attr(rustfmt, rustfmt_skip)]
let expected = i8x16::new(
1, 2, -3, 4, 5, 6, -7, -8,
9, 10, -11, 12, 13, -14, 15, 0,
);
let r = ssse3::_mm_sign_epi8(a, b);
assert_eq!(r, expected);
}

View file

@ -1,16 +1,21 @@
//! Trailing Bit Manipulation (TBM) instruction set.
//!
//! The reference is [AMD64 Architecture Programmer's Manual, Volume 3:
//! General-Purpose and System
//! Instructions](http://support.amd.com/TechDocs/24594.pdf).
//! General-Purpose and System Instructions][amd64_ref].
//!
//! [Wikipedia](https://en.wikipedia.org/wiki/Bit_Manipulation_Instruction_Sets#TBM_.28Trailing_Bit_Manipulation.29)
//! provides a quick overview of the available instructions.
//! [Wikipedia][wikipedia_bmi] provides a quick overview of the available
//! instructions.
//!
//! [amd64_ref]: http://support.amd.com/TechDocs/24594.pdf
//! [wikipedia_bmi]:
//! https://en.wikipedia.org/wiki/Bit_Manipulation_Instruction_Sets#TBM_.
//! 28Trailing_Bit_Manipulation.29
#[cfg(test)]
use stdsimd_test::assert_instr;
// TODO: LLVM-CODEGEN ERROR: LLVM ERROR: Cannot select: intrinsic %llvm.x86.tbm.bextri.u32
// TODO: LLVM-CODEGEN ERROR: LLVM ERROR: Cannot select:
// intrinsic %llvm.x86.tbm.bextri.u32
/*
#[allow(dead_code)]
extern "C" {
@ -39,8 +44,8 @@ pub fn _bextr_u64(a: u64, start: u64, len: u64) -> u64 {
/// Extracts bits of `a` specified by `control` into
/// the least significant bits of the result.
///
/// Bits [7,0] of `control` specify the index to the first bit in the range to be
/// extracted, and bits [15,8] specify the length of the range.
/// Bits [7,0] of `control` specify the index to the first bit in the range to
/// be extracted, and bits [15,8] specify the length of the range.
#[inline(always)]
#[target_feature = "+tbm"]
pub fn _bextr2_u32(a: u32, control: u32) -> u32 {
@ -50,8 +55,8 @@ pub fn _bextr2_u32(a: u32, control: u32) -> u32 {
/// Extracts bits of `a` specified by `control` into
/// the least significant bits of the result.
///
/// Bits [7,0] of `control` specify the index to the first bit in the range to be
/// extracted, and bits [15,8] specify the length of the range.
/// Bits [7,0] of `control` specify the index to the first bit in the range to
/// be extracted, and bits [15,8] specify the length of the range.
#[inline(always)]
#[target_feature = "+tbm"]
pub fn _bextr2_u64(a: u64, control: u64) -> u64 {
@ -122,7 +127,8 @@ pub unsafe fn _blcic_u64(x: u64) -> u64 {
!x & (x.wrapping_add(1))
}
/// Sets the least significant zero bit of `x` and clears all bits above that bit.
/// Sets the least significant zero bit of `x` and clears all bits above
/// that bit.
///
/// If there is no zero bit in `x`, it sets all the bits.
#[inline(always)]
@ -132,7 +138,8 @@ pub unsafe fn _blcmsk_u32(x: u32) -> u32 {
x ^ (x.wrapping_add(1))
}
/// Sets the least significant zero bit of `x` and clears all bits above that bit.
/// Sets the least significant zero bit of `x` and clears all bits above
/// that bit.
///
/// If there is no zero bit in `x`, it sets all the bits.
#[inline(always)]
@ -272,162 +279,152 @@ mod tests {
#[simd_test = "tbm"]
unsafe fn _blcfill_u32() {
assert_eq!(
tbm::_blcfill_u32(0b0101_0111u32),
0b0101_0000u32);
assert_eq!(
tbm::_blcfill_u32(0b1111_1111u32),
0u32);
assert_eq!(tbm::_blcfill_u32(0b0101_0111u32), 0b0101_0000u32);
assert_eq!(tbm::_blcfill_u32(0b1111_1111u32), 0u32);
}
#[simd_test = "tbm"]
#[cfg(not(target_arch = "x86"))]
unsafe fn _blcfill_u64() {
assert_eq!(
tbm::_blcfill_u64(0b0101_0111u64),
0b0101_0000u64);
assert_eq!(
tbm::_blcfill_u64(0b1111_1111u64),
0u64);
assert_eq!(tbm::_blcfill_u64(0b0101_0111u64), 0b0101_0000u64);
assert_eq!(tbm::_blcfill_u64(0b1111_1111u64), 0u64);
}
#[simd_test = "tbm"]
unsafe fn _blci_u32() {
assert_eq!(
tbm::_blci_u32(0b0101_0000u32),
0b1111_1111_1111_1111_1111_1111_1111_1110u32);
0b1111_1111_1111_1111_1111_1111_1111_1110u32
);
assert_eq!(
tbm::_blci_u32(0b1111_1111u32),
0b1111_1111_1111_1111_1111_1110_1111_1111u32);
0b1111_1111_1111_1111_1111_1110_1111_1111u32
);
}
#[simd_test = "tbm"]
#[cfg(not(target_arch = "x86"))]
#[cfg_attr(rustfmt, rustfmt_skip)]
unsafe fn _blci_u64() {
assert_eq!(
tbm::_blci_u64(0b0101_0000u64),
0b1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1110u64);
0b1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1110u64
);
assert_eq!(
tbm::_blci_u64(0b1111_1111u64),
0b1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1110_1111_1111u64);
0b1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1110_1111_1111u64
);
}
#[simd_test = "tbm"]
unsafe fn _blcic_u32() {
assert_eq!(
tbm::_blcic_u32(0b0101_0001u32),
0b0000_0010u32);
assert_eq!(
tbm::_blcic_u32(0b1111_1111u32),
0b1_0000_0000u32);
assert_eq!(tbm::_blcic_u32(0b0101_0001u32), 0b0000_0010u32);
assert_eq!(tbm::_blcic_u32(0b1111_1111u32), 0b1_0000_0000u32);
}
#[simd_test = "tbm"]
#[cfg(not(target_arch = "x86"))]
unsafe fn _blcic_u64() {
assert_eq!(
tbm::_blcic_u64(0b0101_0001u64),
0b0000_0010u64);
assert_eq!(
tbm::_blcic_u64(0b1111_1111u64),
0b1_0000_0000u64);
assert_eq!(tbm::_blcic_u64(0b0101_0001u64), 0b0000_0010u64);
assert_eq!(tbm::_blcic_u64(0b1111_1111u64), 0b1_0000_0000u64);
}
#[simd_test = "tbm"]
unsafe fn _blcmsk_u32() {
assert_eq!(
tbm::_blcmsk_u32(0b0101_0001u32),
0b0000_0011u32);
assert_eq!(
tbm::_blcmsk_u32(0b1111_1111u32),
0b1_1111_1111u32);
assert_eq!(tbm::_blcmsk_u32(0b0101_0001u32), 0b0000_0011u32);
assert_eq!(tbm::_blcmsk_u32(0b1111_1111u32), 0b1_1111_1111u32);
}
#[simd_test = "tbm"]
#[cfg(not(target_arch = "x86"))]
unsafe fn _blcmsk_u64() {
assert_eq!(
tbm::_blcmsk_u64(0b0101_0001u64),
0b0000_0011u64);
assert_eq!(
tbm::_blcmsk_u64(0b1111_1111u64),
0b1_1111_1111u64);
assert_eq!(tbm::_blcmsk_u64(0b0101_0001u64), 0b0000_0011u64);
assert_eq!(tbm::_blcmsk_u64(0b1111_1111u64), 0b1_1111_1111u64);
}
#[simd_test = "tbm"]
unsafe fn _blcs_u32() {
assert_eq!(tbm::_blcs_u32(0b0101_0001u32), 0b0101_0011u32);
assert_eq!(tbm::_blcs_u32(0b1111_1111u32), 0b1_1111_1111u32);
assert_eq!(tbm::_blcs_u32(0b0101_0001u32), 0b0101_0011u32);
assert_eq!(tbm::_blcs_u32(0b1111_1111u32), 0b1_1111_1111u32);
}
#[simd_test = "tbm"]
#[cfg(not(target_arch = "x86"))]
unsafe fn _blcs_u64() {
assert_eq!(tbm::_blcs_u64(0b0101_0001u64), 0b0101_0011u64);
assert_eq!(tbm::_blcs_u64(0b1111_1111u64), 0b1_1111_1111u64);
assert_eq!(tbm::_blcs_u64(0b0101_0001u64), 0b0101_0011u64);
assert_eq!(tbm::_blcs_u64(0b1111_1111u64), 0b1_1111_1111u64);
}
#[simd_test = "tbm"]
unsafe fn _blsfill_u32() {
assert_eq!(
tbm::_blsfill_u32(0b0101_0100u32),
0b0101_0111u32);
assert_eq!(tbm::_blsfill_u32(0b0101_0100u32), 0b0101_0111u32);
assert_eq!(
tbm::_blsfill_u32(0u32),
0b1111_1111_1111_1111_1111_1111_1111_1111u32);
0b1111_1111_1111_1111_1111_1111_1111_1111u32
);
}
#[simd_test = "tbm"]
#[cfg(not(target_arch = "x86"))]
#[cfg_attr(rustfmt, rustfmt_skip)]
unsafe fn _blsfill_u64() {
assert_eq!(
tbm::_blsfill_u64(0b0101_0100u64),
0b0101_0111u64);
assert_eq!(tbm::_blsfill_u64(0b0101_0100u64), 0b0101_0111u64);
assert_eq!(
tbm::_blsfill_u64(0u64),
0b1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111u64);
0b1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111u64
);
}
#[simd_test = "tbm"]
unsafe fn _blsic_u32() {
assert_eq!(
tbm::_blsic_u32(0b0101_0100u32),
0b1111_1111_1111_1111_1111_1111_1111_1011u32);
0b1111_1111_1111_1111_1111_1111_1111_1011u32
);
assert_eq!(
tbm::_blsic_u32(0u32),
0b1111_1111_1111_1111_1111_1111_1111_1111u32);
0b1111_1111_1111_1111_1111_1111_1111_1111u32
);
}
#[simd_test = "tbm"]
#[cfg(not(target_arch = "x86"))]
#[cfg_attr(rustfmt, rustfmt_skip)]
unsafe fn _blsic_u64() {
assert_eq!(
tbm::_blsic_u64(0b0101_0100u64),
0b1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1011u64);
assert_eq!(
tbm::_blsic_u64(0u64),
0b1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111u64);
0b1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1011u64
);
assert_eq!(
tbm::_blsic_u64(0u64),
0b1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111u64
);
}
#[simd_test = "tbm"]
unsafe fn _t1mskc_u32() {
assert_eq!(
tbm::_t1mskc_u32(0b0101_0111u32),
0b1111_1111_1111_1111_1111_1111_1111_1000u32);
assert_eq!(
tbm::_t1mskc_u32(0u32),
0b1111_1111_1111_1111_1111_1111_1111_1111u32);
assert_eq!(
tbm::_t1mskc_u32(0b0101_0111u32),
0b1111_1111_1111_1111_1111_1111_1111_1000u32
);
assert_eq!(
tbm::_t1mskc_u32(0u32),
0b1111_1111_1111_1111_1111_1111_1111_1111u32
);
}
#[simd_test = "tbm"]
#[cfg(not(target_arch = "x86"))]
#[cfg_attr(rustfmt, rustfmt_skip)]
unsafe fn _t1mksc_u64() {
assert_eq!(
tbm::_t1mskc_u64(0b0101_0111u64),
0b1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1000u64);
assert_eq!(
tbm::_t1mskc_u64(0u64),
0b1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111u64);
assert_eq!(
tbm::_t1mskc_u64(0b0101_0111u64),
0b1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1000u64
);
assert_eq!(
tbm::_t1mskc_u64(0u64),
0b1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111u64
);
}
#[simd_test = "tbm"]

View file

@ -2,7 +2,10 @@ use std::env;
fn main() {
println!("cargo:rerun-if-changed=build.rs");
let opt_level = env::var("OPT_LEVEL").ok().and_then(|s| s.parse().ok()).unwrap_or(0);
let opt_level = env::var("OPT_LEVEL")
.ok()
.and_then(|s| s.parse().ok())
.unwrap_or(0);
let profile = env::var("PROFILE").unwrap_or(String::new());
if profile == "release" || opt_level >= 2 {
println!("cargo:rustc-cfg=optimized");

View file

@ -21,13 +21,13 @@ extern crate synom;
use proc_macro2::TokenStream;
#[proc_macro_attribute]
pub fn assert_instr(attr: proc_macro::TokenStream,
item: proc_macro::TokenStream)
-> proc_macro::TokenStream
{
pub fn assert_instr(
attr: proc_macro::TokenStream, item: proc_macro::TokenStream
) -> proc_macro::TokenStream {
let invoc = syn::parse::<Invoc>(attr)
.expect("expected #[assert_instr(instr, a = b, ...)]");
let item = syn::parse::<syn::Item>(item).expect("must be attached to an item");
let item =
syn::parse::<syn::Item>(item).expect("must be attached to an item");
let func = match item.node {
syn::ItemKind::Fn(ref f) => f,
_ => panic!("must be attached to a function"),
@ -40,10 +40,11 @@ pub fn assert_instr(attr: proc_macro::TokenStream,
(quote! { #[ignore] }).into()
};
let name = &func.ident;
let assert_name = syn::Ident::from(&format!("assert_{}_{}",
name.sym.as_str(),
instr.sym.as_str())[..]);
let shim_name = syn::Ident::from(&format!("{}_shim", name.sym.as_str())[..]);
let assert_name = syn::Ident::from(
&format!("assert_{}_{}", name.sym.as_str(), instr.sym.as_str())[..],
);
let shim_name =
syn::Ident::from(&format!("{}_shim", name.sym.as_str())[..]);
let (to_test, test_name) = if invoc.args.len() == 0 {
(TokenStream::empty(), &func.ident)
} else {
@ -69,16 +70,29 @@ pub fn assert_instr(attr: proc_macro::TokenStream,
}
};
}
let attrs = item.attrs.iter().filter(|attr| {
attr.path.segments.get(0).item().ident.sym.as_str().starts_with("target")
}).collect::<Vec<_>>();
let attrs = item.attrs
.iter()
.filter(|attr| {
attr.path
.segments
.get(0)
.item()
.ident
.sym
.as_str()
.starts_with("target")
})
.collect::<Vec<_>>();
let attrs = Append(&attrs);
(quote! {
#attrs
unsafe fn #shim_name(#(#inputs),*) #ret {
#name(#(#input_vals),*)
}
}.into(), &shim_name)
(
quote! {
#attrs
unsafe fn #shim_name(#(#inputs),*) #ret {
#name(#(#input_vals),*)
}
}.into(),
&shim_name,
)
};
let tts: TokenStream = quote! {
@ -128,8 +142,9 @@ impl synom::Synom for Invoc {
struct Append<T>(T);
impl<T> quote::ToTokens for Append<T>
where T: Clone + IntoIterator,
T::Item: quote::ToTokens
where
T: Clone + IntoIterator,
T::Item: quote::ToTokens,
{
fn to_tokens(&self, tokens: &mut quote::Tokens) {
for item in self.0.clone() {

View file

@ -1,16 +1,16 @@
//! Implementation of the `#[simd_test]` macro
//!
//! This macro expands to a `#[test]` function which tests the local machine for
//! the appropriate cfg before calling the inner test function.
//! This macro expands to a `#[test]` function which tests the local machine
//! for the appropriate cfg before calling the inner test function.
#![feature(proc_macro)]
extern crate proc_macro2;
extern crate proc_macro;
#[macro_use]
extern crate quote;
extern crate proc_macro;
extern crate proc_macro2;
use proc_macro2::{TokenStream, Term, TokenNode, TokenTree};
use proc_macro2::{Term, TokenNode, TokenStream, TokenTree};
use proc_macro2::Literal;
fn string(s: &str) -> TokenTree {
@ -22,8 +22,9 @@ fn string(s: &str) -> TokenTree {
}
#[proc_macro_attribute]
pub fn simd_test(attr: proc_macro::TokenStream,
item: proc_macro::TokenStream) -> proc_macro::TokenStream {
pub fn simd_test(
attr: proc_macro::TokenStream, item: proc_macro::TokenStream
) -> proc_macro::TokenStream {
let tokens = TokenStream::from(attr).into_iter().collect::<Vec<_>>();
if tokens.len() != 2 {
panic!("expected #[simd_test = \"feature\"]");
@ -37,8 +38,9 @@ pub fn simd_test(attr: proc_macro::TokenStream,
TokenNode::Literal(ref l) => l.to_string(),
_ => panic!("expected #[simd_test = \"feature\"]"),
};
let enable_feature = enable_feature.trim_left_matches('"')
.trim_right_matches('"');
let enable_feature = enable_feature
.trim_left_matches('"')
.trim_right_matches('"');
let enable_feature = string(&format!("+{}", enable_feature));
let item = TokenStream::from(item);
let name = find_name(item.clone());
@ -67,7 +69,7 @@ fn find_name(item: TokenStream) -> Term {
while let Some(tok) = tokens.next() {
if let TokenNode::Term(word) = tok.kind {
if word.as_str() == "fn" {
break
break;
}
}
}

View file

@ -7,12 +7,12 @@
#![feature(proc_macro)]
extern crate assert_instr_macro;
extern crate simd_test_macro;
extern crate backtrace;
extern crate cc;
extern crate rustc_demangle;
#[macro_use]
extern crate lazy_static;
extern crate rustc_demangle;
extern crate simd_test_macro;
use std::collections::HashMap;
use std::env;
@ -23,7 +23,8 @@ pub use assert_instr_macro::*;
pub use simd_test_macro::*;
lazy_static! {
static ref DISASSEMBLY: HashMap<String, Vec<Function>> = disassemble_myself();
static ref DISASSEMBLY: HashMap<String, Vec<Function>>
= disassemble_myself();
}
struct Function {
@ -37,14 +38,22 @@ struct Instruction {
fn disassemble_myself() -> HashMap<String, Vec<Function>> {
let me = env::current_exe().expect("failed to get current exe");
if cfg!(target_arch = "x86_64") &&
cfg!(target_os = "windows") &&
cfg!(target_env = "msvc") {
let mut cmd = cc::windows_registry::find("x86_64-pc-windows-msvc", "dumpbin.exe")
.expect("failed to find `dumpbin` tool");
let output = cmd.arg("/DISASM").arg(&me).output()
if cfg!(target_arch = "x86_64") && cfg!(target_os = "windows")
&& cfg!(target_env = "msvc")
{
let mut cmd = cc::windows_registry::find(
"x86_64-pc-windows-msvc",
"dumpbin.exe",
).expect("failed to find `dumpbin` tool");
let output = cmd.arg("/DISASM")
.arg(&me)
.output()
.expect("failed to execute dumpbin");
println!("{}\n{}", output.status, String::from_utf8_lossy(&output.stderr));
println!(
"{}\n{}",
output.status,
String::from_utf8_lossy(&output.stderr)
);
assert!(output.status.success());
parse_dumpbin(&String::from_utf8_lossy(&output.stdout))
} else if cfg!(target_os = "windows") {
@ -55,7 +64,11 @@ fn disassemble_myself() -> HashMap<String, Vec<Function>> {
.arg(&me)
.output()
.expect("failed to execute otool");
println!("{}\n{}", output.status, String::from_utf8_lossy(&output.stderr));
println!(
"{}\n{}",
output.status,
String::from_utf8_lossy(&output.stderr)
);
assert!(output.status.success());
parse_otool(&str::from_utf8(&output.stdout).expect("stdout not utf8"))
@ -66,10 +79,16 @@ fn disassemble_myself() -> HashMap<String, Vec<Function>> {
.arg(&me)
.output()
.expect("failed to execute objdump");
println!("{}\n{}", output.status, String::from_utf8_lossy(&output.stderr));
println!(
"{}\n{}",
output.status,
String::from_utf8_lossy(&output.stderr)
);
assert!(output.status.success());
parse_objdump(&str::from_utf8(&output.stdout).expect("stdout not utf8"))
parse_objdump(
&str::from_utf8(&output.stdout).expect("stdout not utf8"),
)
}
}
@ -91,7 +110,7 @@ fn parse_objdump(output: &str) -> HashMap<String, Vec<Function>> {
while let Some(header) = lines.next() {
// symbols should start with `$hex_addr <$name>:`
if !header.ends_with(">:") {
continue
continue;
}
let start = header.find("<").unwrap();
let symbol = &header[start + 1..header.len() - 2];
@ -99,15 +118,17 @@ fn parse_objdump(output: &str) -> HashMap<String, Vec<Function>> {
let mut instructions = Vec::new();
while let Some(instruction) = lines.next() {
if instruction.is_empty() {
break
break;
}
// Each line of instructions should look like:
//
// $rel_offset: ab cd ef 00 $instruction...
let parts = instruction.split_whitespace()
let parts = instruction
.split_whitespace()
.skip(1)
.skip_while(|s| {
s.len() == expected_len && usize::from_str_radix(s, 16).is_ok()
s.len() == expected_len
&& usize::from_str_radix(s, 16).is_ok()
})
.map(|s| s.to_string())
.collect::<Vec<String>>();
@ -116,10 +137,12 @@ fn parse_objdump(output: &str) -> HashMap<String, Vec<Function>> {
ret.entry(normalize(symbol))
.or_insert(Vec::new())
.push(Function { instrs: instructions });
.push(Function {
instrs: instructions,
});
}
return ret
return ret;
}
fn parse_otool(output: &str) -> HashMap<String, Vec<Function>> {
@ -138,7 +161,7 @@ fn parse_otool(output: &str) -> HashMap<String, Vec<Function>> {
};
// symbols should start with `$symbol:`
if !header.ends_with(":") {
continue
continue;
}
// strip the leading underscore and the trailing colon
let symbol = &header[1..header.len() - 1];
@ -147,12 +170,13 @@ fn parse_otool(output: &str) -> HashMap<String, Vec<Function>> {
while let Some(instruction) = lines.next() {
if instruction.ends_with(":") {
cached_header = Some(instruction);
break
break;
}
// Each line of instructions should look like:
//
// $addr $instruction...
let parts = instruction.split_whitespace()
let parts = instruction
.split_whitespace()
.skip(1)
.map(|s| s.to_string())
.collect::<Vec<String>>();
@ -161,10 +185,12 @@ fn parse_otool(output: &str) -> HashMap<String, Vec<Function>> {
ret.entry(normalize(symbol))
.or_insert(Vec::new())
.push(Function { instrs: instructions });
.push(Function {
instrs: instructions,
});
}
return ret
return ret;
}
fn parse_dumpbin(output: &str) -> HashMap<String, Vec<Function>> {
@ -183,7 +209,7 @@ fn parse_dumpbin(output: &str) -> HashMap<String, Vec<Function>> {
};
// symbols should start with `$symbol:`
if !header.ends_with(":") {
continue
continue;
}
// strip the trailing colon
let symbol = &header[..header.len() - 1];
@ -192,20 +218,21 @@ fn parse_dumpbin(output: &str) -> HashMap<String, Vec<Function>> {
while let Some(instruction) = lines.next() {
if !instruction.starts_with(" ") {
cached_header = Some(instruction);
break
break;
}
// Each line looks like:
//
// > $addr: ab cd ef $instr..
// > 00 12 # this line os optional
if instruction.starts_with(" ") {
continue
continue;
}
let parts = instruction.split_whitespace()
let parts = instruction
.split_whitespace()
.skip(1)
.skip_while(|s| {
s.len() == 2 && usize::from_str_radix(s, 16).is_ok()
})
.skip_while(
|s| s.len() == 2 && usize::from_str_radix(s, 16).is_ok(),
)
.map(|s| s.to_string())
.collect::<Vec<String>>();
instructions.push(Instruction { parts });
@ -213,10 +240,12 @@ fn parse_dumpbin(output: &str) -> HashMap<String, Vec<Function>> {
ret.entry(normalize(symbol))
.or_insert(Vec::new())
.push(Function { instrs: instructions });
.push(Function {
instrs: instructions,
});
}
return ret
return ret;
}
fn normalize(symbol: &str) -> String {
@ -266,7 +295,7 @@ pub fn assert(fnptr: usize, fnname: &str, expected: &str) {
// instruction: tzcntl => tzcnt and compares that.
if part.starts_with(expected) {
found = true;
break
break;
}
}
}
@ -274,7 +303,7 @@ pub fn assert(fnptr: usize, fnname: &str, expected: &str) {
let probably_only_one_instruction = function.instrs.len() < 30;
if found && probably_only_one_instruction {
return
return;
}
// Help debug by printing out the found disassembly, and then panic as we

View file

@ -1,10 +1,10 @@
#![cfg_attr(feature = "strict", deny(warnings))]
#![feature(cfg_target_feature)]
extern crate cupid;
#[macro_use]
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
extern crate stdsimd;
extern crate cupid;
#[test]
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]