From 69d2ad85f3e361a7c61cf4aa8697c39bf70b6afc Mon Sep 17 00:00:00 2001 From: gnzlbg Date: Fri, 27 Oct 2017 17:55:29 +0200 Subject: [PATCH] [ci] check formatting (#64) * [ci] check formatting * [rustfmt] reformat the whole library --- library/stdarch/.travis.yml | 14 +- library/stdarch/examples/nbody.rs | 119 ++-- library/stdarch/examples/play.rs | 8 +- library/stdarch/rustfmt.toml | 5 + library/stdarch/src/arm/mod.rs | 4 +- library/stdarch/src/arm/v6.rs | 19 +- library/stdarch/src/arm/v7.rs | 14 +- library/stdarch/src/arm/v7_neon.rs | 88 +-- library/stdarch/src/arm/v8.rs | 37 +- library/stdarch/src/lib.rs | 43 +- library/stdarch/src/macros.rs | 15 +- library/stdarch/src/v128.rs | 18 +- library/stdarch/src/v256.rs | 18 +- library/stdarch/src/v512.rs | 19 +- library/stdarch/src/v64.rs | 4 +- library/stdarch/src/x86/abm.rs | 32 +- library/stdarch/src/x86/avx.rs | 494 ++++++++------ library/stdarch/src/x86/avx2.rs | 538 ++++++++++------ library/stdarch/src/x86/bmi.rs | 25 +- library/stdarch/src/x86/bmi2.rs | 54 +- library/stdarch/src/x86/macros.rs | 2 - library/stdarch/src/x86/mod.rs | 2 +- library/stdarch/src/x86/runtime.rs | 190 ++++-- library/stdarch/src/x86/sse.rs | 603 +++++++++++------- library/stdarch/src/x86/sse2.rs | 482 ++++++++------ library/stdarch/src/x86/sse3.rs | 9 +- library/stdarch/src/x86/sse41.rs | 114 ++-- library/stdarch/src/x86/sse42.rs | 202 +++--- library/stdarch/src/x86/ssse3.rs | 98 +-- library/stdarch/src/x86/tbm.rs | 159 +++-- .../stdsimd-test/assert-instr-macro/build.rs | 5 +- .../assert-instr-macro/src/lib.rs | 55 +- .../stdsimd-test/simd-test-macro/src/lib.rs | 22 +- library/stdarch/stdsimd-test/src/lib.rs | 99 ++- library/stdarch/tests/cpu-detection.rs | 2 +- 35 files changed, 2207 insertions(+), 1405 deletions(-) create mode 100644 library/stdarch/rustfmt.toml diff --git a/library/stdarch/.travis.yml b/library/stdarch/.travis.yml index 3c2dd7b2d866..ed8cbdae308d 100644 --- a/library/stdarch/.travis.yml +++ b/library/stdarch/.travis.yml @@ -17,9 +17,21 @@ matrix: script: ci/run.sh - install: true script: ci/dox.sh + - env: RUSTFMT=On TARGET=x86_64-unknown-linux-gnu NO_ADD=1 + script: | + cargo install rustfmt-nightly + cargo fmt -- --write-mode=diff + cd stdsimd + cargo fmt -- --write-mode=diff + cd assert-instr-macro + cargo fmt -- --write-mode=diff + cd ../simd-test-macro + cargo fmt -- --write-mode=diff + allow_failures: + - env: RUSTFMT=On TARGET=x86_64-unknown-linux-gnu NO_ADD=1 install: - - if [ "$NO_ADD" = "" ]; then rustup target add $TARGET; fi + - if [ "$NO_ADD" == "" ]; then rustup target add $TARGET; fi script: - cargo generate-lockfile diff --git a/library/stdarch/examples/nbody.rs b/library/stdarch/examples/nbody.rs index 3f6e7ccffe9c..a9baa74ff76c 100644 --- a/library/stdarch/examples/nbody.rs +++ b/library/stdarch/examples/nbody.rs @@ -26,7 +26,8 @@ impl Frsqrt for f64x2 { let u = unsafe { vendor::_mm_rsqrt_ps( - f32x4::new(t.extract(0), t.extract(1), 0., 0.)).as_f64x4() + f32x4::new(t.extract(0), t.extract(1), 0., 0.), + ).as_f64x4() }; f64x2::new(u.extract(0), u.extract(1)) } @@ -36,11 +37,12 @@ impl Frsqrt for f64x2 { use self::stdsimd::vendor; unsafe { vendor::vrsqrte_f32(self.as_f32x2()).as_f64x2() } } - #[cfg(not(any(all(any(target_arch = "x86", target_arch = "x86_64"), + #[cfg(not(any(all(any(target_arch = "x86", + target_arch = "x86_64"), target_feature = "sse"), - all(any(target_arch = "arm", target_arch = "aarch64"), - target_feature = "neon") - )))] + all(any(target_arch = "arm", + target_arch = "aarch64"), + target_feature = "neon"))))] { self.replace(0, 1. / self.extract(0).sqrt()); self.replace(1, 1. / self.extract(1).sqrt()); @@ -57,9 +59,9 @@ struct Body { } impl Body { - fn new(x0: f64, x1: f64, x2: f64, - v0: f64, v1: f64, v2: f64, - mass: f64) -> Body { + fn new( + x0: f64, x1: f64, x2: f64, v0: f64, v1: f64, v2: f64, mass: f64 + ) -> Body { Body { x: [x0, x1, x2], _fill: 0.0, @@ -91,7 +93,7 @@ fn advance(bodies: &mut [Body; N_BODIES], dt: f64) { let mut i = 0; for j in 0..N_BODIES { - for k in j+1..N_BODIES { + for k in j + 1..N_BODIES { for m in 0..3 { r[i][m] = bodies[j].x[m] - bodies[k].x[m]; } @@ -102,14 +104,15 @@ fn advance(bodies: &mut [Body; N_BODIES], dt: f64) { i = 0; while i < N { for m in 0..3 { - dx[m] = f64x2::new(r[i][m], r[i+1][m]); + dx[m] = f64x2::new(r[i][m], r[i + 1][m]); } dsquared = dx[0] * dx[0] + dx[1] * dx[1] + dx[2] * dx[2]; distance = dsquared.frsqrt(); for _ in 0..2 { - distance = distance * f64x2::splat(1.5) - - ((f64x2::splat(0.5) * dsquared) * distance) * (distance * distance) + distance = distance * f64x2::splat(1.5) + - ((f64x2::splat(0.5) * dsquared) * distance) + * (distance * distance) } dmag = f64x2::splat(dt) / dsquared * distance; dmag.store(&mut mag, i); @@ -119,7 +122,7 @@ fn advance(bodies: &mut [Body; N_BODIES], dt: f64) { i = 0; for j in 0..N_BODIES { - for k in j+1..N_BODIES { + for k in j + 1..N_BODIES { for m in 0..3 { bodies[j].v[m] -= r[i][m] * bodies[k].mass * mag[i]; bodies[k].v[m] += r[i][m] * bodies[j].mass * mag[i]; @@ -138,15 +141,19 @@ fn energy(bodies: &[Body; N_BODIES]) -> f64 { let mut e = 0.0; for i in 0..N_BODIES { let bi = &bodies[i]; - e += bi.mass * (bi.v[0] * bi.v[0] + bi.v[1] * bi.v[1] + bi.v[2] * bi.v[2]) / 2.0; - for j in i+1..N_BODIES { + e += bi.mass + * (bi.v[0] * bi.v[0] + bi.v[1] * bi.v[1] + bi.v[2] * bi.v[2]) + / 2.0; + for j in i + 1..N_BODIES { let bj = &bodies[j]; let mut dx = [0.0; 3]; for k in 0..3 { dx[k] = bi.x[k] - bj.x[k]; } let mut distance = 0.0; - for &d in &dx { distance += d * d } + for &d in &dx { + distance += d * d + } e -= bi.mass * bj.mass / distance.sqrt() } } @@ -156,48 +163,54 @@ fn energy(bodies: &[Body; N_BODIES]) -> f64 { fn main() { let mut bodies: [Body; N_BODIES] = [ /* sun */ - Body::new(0.0, 0.0, 0.0, - 0.0, 0.0, 0.0, - SOLAR_MASS), + Body::new(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, SOLAR_MASS), /* jupiter */ - Body::new(4.84143144246472090e+00, - -1.16032004402742839e+00, - -1.03622044471123109e-01 , - 1.66007664274403694e-03 * DAYS_PER_YEAR, - 7.69901118419740425e-03 * DAYS_PER_YEAR, - -6.90460016972063023e-05 * DAYS_PER_YEAR , - 9.54791938424326609e-04 * SOLAR_MASS - ), + Body::new( + 4.84143144246472090e+00, + -1.16032004402742839e+00, + -1.03622044471123109e-01, + 1.66007664274403694e-03 * DAYS_PER_YEAR, + 7.69901118419740425e-03 * DAYS_PER_YEAR, + -6.90460016972063023e-05 * DAYS_PER_YEAR, + 9.54791938424326609e-04 * SOLAR_MASS, + ), /* saturn */ - Body::new(8.34336671824457987e+00, - 4.12479856412430479e+00, - -4.03523417114321381e-01 , - -2.76742510726862411e-03 * DAYS_PER_YEAR, - 4.99852801234917238e-03 * DAYS_PER_YEAR, - 2.30417297573763929e-05 * DAYS_PER_YEAR , - 2.85885980666130812e-04 * SOLAR_MASS - ), + Body::new( + 8.34336671824457987e+00, + 4.12479856412430479e+00, + -4.03523417114321381e-01, + -2.76742510726862411e-03 * DAYS_PER_YEAR, + 4.99852801234917238e-03 * DAYS_PER_YEAR, + 2.30417297573763929e-05 * DAYS_PER_YEAR, + 2.85885980666130812e-04 * SOLAR_MASS, + ), /* uranus */ - Body::new(1.28943695621391310e+01, - -1.51111514016986312e+01, - -2.23307578892655734e-01 , - 2.96460137564761618e-03 * DAYS_PER_YEAR, - 2.37847173959480950e-03 * DAYS_PER_YEAR, - -2.96589568540237556e-05 * DAYS_PER_YEAR , - 4.36624404335156298e-05 * SOLAR_MASS - ), + Body::new( + 1.28943695621391310e+01, + -1.51111514016986312e+01, + -2.23307578892655734e-01, + 2.96460137564761618e-03 * DAYS_PER_YEAR, + 2.37847173959480950e-03 * DAYS_PER_YEAR, + -2.96589568540237556e-05 * DAYS_PER_YEAR, + 4.36624404335156298e-05 * SOLAR_MASS, + ), /* neptune */ - Body::new(1.53796971148509165e+01, - -2.59193146099879641e+01, - 1.79258772950371181e-01 , - 2.68067772490389322e-03 * DAYS_PER_YEAR, - 1.62824170038242295e-03 * DAYS_PER_YEAR, - -9.51592254519715870e-05 * DAYS_PER_YEAR , - 5.15138902046611451e-05 * SOLAR_MASS - ) - ]; + Body::new( + 1.53796971148509165e+01, + -2.59193146099879641e+01, + 1.79258772950371181e-01, + 2.68067772490389322e-03 * DAYS_PER_YEAR, + 1.62824170038242295e-03 * DAYS_PER_YEAR, + -9.51592254519715870e-05 * DAYS_PER_YEAR, + 5.15138902046611451e-05 * SOLAR_MASS, + ), + ]; - let n: usize = std::env::args().nth(1).expect("need one arg").parse().unwrap(); + let n: usize = std::env::args() + .nth(1) + .expect("need one arg") + .parse() + .unwrap(); offset_momentum(&mut bodies); println!("{:.9}", energy(&bodies)); diff --git a/library/stdarch/examples/play.rs b/library/stdarch/examples/play.rs index 5cbbdd44d52a..26ce5dd8124c 100644 --- a/library/stdarch/examples/play.rs +++ b/library/stdarch/examples/play.rs @@ -27,8 +27,12 @@ mod example { unsafe { vendor::_mm_cmpestri( - vneedle, needle_len as i32, vhaystack, hay_len as i32, - vendor::_SIDD_CMP_EQUAL_ORDERED) as usize + vneedle, + needle_len as i32, + vhaystack, + hay_len as i32, + vendor::_SIDD_CMP_EQUAL_ORDERED, + ) as usize } } diff --git a/library/stdarch/rustfmt.toml b/library/stdarch/rustfmt.toml new file mode 100644 index 000000000000..91dd4706fd4d --- /dev/null +++ b/library/stdarch/rustfmt.toml @@ -0,0 +1,5 @@ +max_width = 79 +fn_call_width = 79 +wrap_comments = true +error_on_line_overflow = false +fn_args_density = "Compressed" \ No newline at end of file diff --git a/library/stdarch/src/arm/mod.rs b/library/stdarch/src/arm/mod.rs index 08daf23bbdb3..8266842d8254 100644 --- a/library/stdarch/src/arm/mod.rs +++ b/library/stdarch/src/arm/mod.rs @@ -3,7 +3,9 @@ //! The reference for NEON is [ARM's NEON Intrinsics Reference][arm_ref]. The //! [ARM's NEON Intrinsics Online Database][arm_dat] is also useful. //! -//! [arm_ref]: http://infocenter.arm.com/help/topic/com.arm.doc.ihi0073a/IHI0073A_arm_neon_intrinsics_ref.pdf +//! [arm_ref]: +//! http://infocenter.arm.com/help/topic/com.arm.doc. +//! ihi0073a/IHI0073A_arm_neon_intrinsics_ref.pdf //! [arm_dat]: https://developer.arm.com/technologies/neon/intrinsics pub use self::v6::*; diff --git a/library/stdarch/src/arm/v6.rs b/library/stdarch/src/arm/v6.rs index 63d492d9f187..33fdda67e940 100644 --- a/library/stdarch/src/arm/v6.rs +++ b/library/stdarch/src/arm/v6.rs @@ -1,7 +1,10 @@ //! ARMv6 intrinsics. //! -//! The reference is [ARMv6-M Architecture Reference -//! Manual](http://infocenter.arm.com/help/index.jsp?topic=/com.arm.doc.ddi0419c/index.html). +//! The reference is [ARMv6-M Architecture Reference Manual][armv6m]. +//! +//! [armv6m]: +//! http://infocenter.arm.com/help/index.jsp?topic=/com.arm.doc.ddi0419c/index. +//! html #[cfg(test)] use stdsimd_test::assert_instr; @@ -27,16 +30,20 @@ mod tests { #[test] fn _rev_u16() { unsafe { - assert_eq!(v6::_rev_u16(0b0000_0000_1111_1111_u16), 0b1111_1111_0000_0000_u16); + assert_eq!( + v6::_rev_u16(0b0000_0000_1111_1111_u16), + 0b1111_1111_0000_0000_u16 + ); } } #[test] fn _rev_u32() { unsafe { - assert_eq!(v6::_rev_u32( - 0b0000_0000_1111_1111_0000_0000_1111_1111_u32 - ), 0b1111_1111_0000_0000_1111_1111_0000_0000_u32); + assert_eq!( + v6::_rev_u32(0b0000_0000_1111_1111_0000_0000_1111_1111_u32), + 0b1111_1111_0000_0000_1111_1111_0000_0000_u32 + ); } } } diff --git a/library/stdarch/src/arm/v7.rs b/library/stdarch/src/arm/v7.rs index f0143e581d48..b1c66647120e 100644 --- a/library/stdarch/src/arm/v7.rs +++ b/library/stdarch/src/arm/v7.rs @@ -1,7 +1,11 @@ //! ARMv7 intrinsics. //! //! The reference is [ARMv7-M Architecture Reference Manual (Issue -//! E.b)](http://infocenter.arm.com/help/index.jsp?topic=/com.arm.doc.ddi0403e.b/index.html). +//! E.b)][armv7m]. +//! +//! [armv7m]: +//! http://infocenter.arm.com/help/index.jsp?topic=/com.arm.doc.ddi0403e. +//! b/index.html pub use super::v6::*; @@ -39,7 +43,7 @@ pub unsafe fn _rbit_u32(x: u32) -> u32 { #[allow(dead_code)] extern "C" { - #[link_name="llvm.bitreverse.i32"] + #[link_name = "llvm.bitreverse.i32"] fn rbit_u32(i: i32) -> i32; } @@ -72,8 +76,10 @@ mod tests { #[test] fn _rbit_u32() { unsafe { - assert_eq!(v7::_rbit_u32(0b0000_1010u32), - 0b0101_0000_0000_0000_0000_0000_0000_0000u32); + assert_eq!( + v7::_rbit_u32(0b0000_1010u32), + 0b0101_0000_0000_0000_0000_0000_0000_0000u32 + ); } } } diff --git a/library/stdarch/src/arm/v7_neon.rs b/library/stdarch/src/arm/v7_neon.rs index 7e82659e97c9..4c4c5f35e261 100644 --- a/library/stdarch/src/arm/v7_neon.rs +++ b/library/stdarch/src/arm/v7_neon.rs @@ -5,10 +5,8 @@ use stdsimd_test::assert_instr; use simd_llvm::simd_add; -use v64::{i8x8, i16x4, i32x2, - u8x8, u16x4, u32x2, f32x2}; -use v128::{i8x16, i16x8, i32x4, i64x2, - u8x16, u16x8, u32x4, u64x2, f32x4}; +use v64::{f32x2, i16x4, i32x2, i8x8, u16x4, u32x2, u8x8}; +use v128::{f32x4, i16x8, i32x4, i64x2, i8x16, u16x8, u32x4, u64x2, u8x16}; /// Vector add. #[inline(always)] @@ -230,18 +228,9 @@ mod tests { #[test] fn vaddq_s8_() { - let a = i8x16::new( - 1, 2, 3, 4, 5, 6, 7, 8, - 1, 2, 3, 4, 5, 6, 7, 8, - ); - let b = i8x16::new( - 8, 7, 6, 5, 4, 3, 2, 1, - 8, 7, 6, 5, 4, 3, 2, 1, - ); - let e = i8x16::new( - 9, 9, 9, 9, 9, 9, 9, 9, - 9, 9, 9, 9, 9, 9, 9, 9, - ); + let a = i8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8); + let b = i8x16::new(8, 7, 6, 5, 4, 3, 2, 1, 8, 7, 6, 5, 4, 3, 2, 1); + let e = i8x16::new(9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9); let r = unsafe { vaddq_s8(a, b) }; assert_eq!(r, e); } @@ -293,18 +282,9 @@ mod tests { #[test] fn vaddq_u8_() { - let a = u8x16::new( - 1, 2, 3, 4, 5, 6, 7, 8, - 1, 2, 3, 4, 5, 6, 7, 8, - ); - let b = u8x16::new( - 8, 7, 6, 5, 4, 3, 2, 1, - 8, 7, 6, 5, 4, 3, 2, 1, - ); - let e = u8x16::new( - 9, 9, 9, 9, 9, 9, 9, 9, - 9, 9, 9, 9, 9, 9, 9, 9, - ); + let a = u8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8); + let b = u8x16::new(8, 7, 6, 5, 4, 3, 2, 1, 8, 7, 6, 5, 4, 3, 2, 1); + let e = u8x16::new(9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9); let r = unsafe { vaddq_u8(a, b) }; assert_eq!(r, e); } @@ -366,15 +346,9 @@ mod tests { #[test] fn vaddl_s8_() { let v = ::std::i8::MAX; - let a = i8x8::new( - v, v, v, v, - v, v, v, v, - ); + let a = i8x8::new(v, v, v, v, v, v, v, v); let v = 2 * (v as i16); - let e = i16x8::new( - v, v, v, v, - v, v, v, v, - ); + let e = i16x8::new(v, v, v, v, v, v, v, v); let r = unsafe { vaddl_s8(a, a) }; assert_eq!(r, e); } @@ -382,13 +356,9 @@ mod tests { #[test] fn vaddl_s16_() { let v = ::std::i16::MAX; - let a = i16x4::new( - v, v, v, v, - ); + let a = i16x4::new(v, v, v, v); let v = 2 * (v as i32); - let e = i32x4::new( - v, v, v, v, - ); + let e = i32x4::new(v, v, v, v); let r = unsafe { vaddl_s16(a, a) }; assert_eq!(r, e); } @@ -396,13 +366,9 @@ mod tests { #[test] fn vaddl_s32_() { let v = ::std::i32::MAX; - let a = i32x2::new( - v, v, - ); + let a = i32x2::new(v, v); let v = 2 * (v as i64); - let e = i64x2::new( - v, v, - ); + let e = i64x2::new(v, v); let r = unsafe { vaddl_s32(a, a) }; assert_eq!(r, e); } @@ -410,15 +376,9 @@ mod tests { #[test] fn vaddl_u8_() { let v = ::std::u8::MAX; - let a = u8x8::new( - v, v, v, v, - v, v, v, v, - ); + let a = u8x8::new(v, v, v, v, v, v, v, v); let v = 2 * (v as u16); - let e = u16x8::new( - v, v, v, v, - v, v, v, v, - ); + let e = u16x8::new(v, v, v, v, v, v, v, v); let r = unsafe { vaddl_u8(a, a) }; assert_eq!(r, e); } @@ -426,13 +386,9 @@ mod tests { #[test] fn vaddl_u16_() { let v = ::std::u16::MAX; - let a = u16x4::new( - v, v, v, v, - ); + let a = u16x4::new(v, v, v, v); let v = 2 * (v as u32); - let e = u32x4::new( - v, v, v, v, - ); + let e = u32x4::new(v, v, v, v); let r = unsafe { vaddl_u16(a, a) }; assert_eq!(r, e); } @@ -440,13 +396,9 @@ mod tests { #[test] fn vaddl_u32_() { let v = ::std::u32::MAX; - let a = u32x2::new( - v, v, - ); + let a = u32x2::new(v, v); let v = 2 * (v as u64); - let e = u64x2::new( - v, v, - ); + let e = u64x2::new(v, v); let r = unsafe { vaddl_u32(a, a) }; assert_eq!(r, e); } diff --git a/library/stdarch/src/arm/v8.rs b/library/stdarch/src/arm/v8.rs index 53815e7a628d..cf623fd9fc36 100644 --- a/library/stdarch/src/arm/v8.rs +++ b/library/stdarch/src/arm/v8.rs @@ -1,6 +1,9 @@ //! ARMv8 intrinsics. //! -//! The reference is [ARMv8-A Reference Manual](http://infocenter.arm.com/help/index.jsp?topic=/com.arm.doc.ddi0487a.k_10775/index.html). +//! The reference is [ARMv8-A Reference Manual][armv8]. +//! +//! [armv8]: http://infocenter.arm.com/help/index.jsp?topic=/com.arm.doc. +//! ddi0487a.k_10775/index.html pub use super::v7::*; @@ -23,7 +26,7 @@ pub unsafe fn _clz_u64(x: u64) -> u64 { #[allow(dead_code)] extern "C" { - #[link_name="llvm.bitreverse.i64"] + #[link_name = "llvm.bitreverse.i64"] fn rbit_u64(i: i64) -> i64; } @@ -61,9 +64,10 @@ mod tests { #[test] fn _rev_u64() { unsafe { - assert_eq!(v8::_rev_u64( - 0b0000_0000_1111_1111_0000_0000_1111_1111_u64 - ), 0b1111_1111_0000_0000_1111_1111_0000_0000_0000_0000_0000_0000_0000_0000_0000_0000_u64); + assert_eq!( + v8::_rev_u64(0b0000_0000_1111_1111_0000_0000_1111_1111_u64), + 0b1111_1111_0000_0000_1111_1111_0000_0000_0000_0000_0000_0000_0000_0000_0000_0000_u64 + ); } } @@ -77,27 +81,32 @@ mod tests { #[test] fn _rbit_u64() { unsafe { - assert_eq!(v8::_rbit_u64( - 0b0000_0000_1111_1101_0000_0000_1111_1111_u64 - ), 0b1111_1111_0000_0000_1011_1111_0000_0000_0000_0000_0000_0000_0000_0000_0000_0000_u64); + assert_eq!( + v8::_rbit_u64(0b0000_0000_1111_1101_0000_0000_1111_1111_u64), + 0b1111_1111_0000_0000_1011_1111_0000_0000_0000_0000_0000_0000_0000_0000_0000_0000_u64 + ); } } #[test] fn _cls_u32() { unsafe { - assert_eq!(v8::_cls_u32( - 0b1111_1111_1111_1111_0000_0000_1111_1111_u32 - ), 15_u32); + assert_eq!( + v8::_cls_u32(0b1111_1111_1111_1111_0000_0000_1111_1111_u32), + 15_u32 + ); } } #[test] fn _cls_u64() { unsafe { - assert_eq!(v8::_cls_u64( - 0b1111_1111_1111_1111_0000_0000_1111_1111_0000_0000_0000_0000_0000_0000_0000_0000_u64 - ), 15_u64); + assert_eq!( + v8::_cls_u64( + 0b1111_1111_1111_1111_0000_0000_1111_1111_0000_0000_0000_0000_0000_0000_0000_0000_u64 + ), + 15_u64 + ); } } } diff --git a/library/stdarch/src/lib.rs b/library/stdarch/src/lib.rs index e30be8c53d53..849acba395ad 100644 --- a/library/stdarch/src/lib.rs +++ b/library/stdarch/src/lib.rs @@ -44,9 +44,9 @@ //! have no runtime support for whether you CPU actually supports the //! instruction. //! -//! CPU target feature detection is done via the `cfg_feature_enabled!` macro at -//! runtime. This macro will detect at runtime whether the specified feature is -//! available or not, returning true or false depending on the current CPU. +//! CPU target feature detection is done via the `cfg_feature_enabled!` macro +//! at runtime. This macro will detect at runtime whether the specified feature +//! is available or not, returning true or false depending on the current CPU. //! //! ``` //! #![feature(cfg_target_feature)] @@ -58,7 +58,8 @@ //! if cfg_feature_enabled!("avx2") { //! println!("avx2 intrinsics will work"); //! } else { -//! println!("avx2 intrinsics will not work, they may generate SIGILL"); +//! println!("avx2 intrinsics will not work"); +//! // undefined behavior: may generate a `SIGILL`. //! } //! } //! ``` @@ -93,29 +94,33 @@ //! //! # Status //! -//! This crate is intended for eventual inclusion into the standard library, but -//! some work and experimentation is needed to get there! First and foremost you -//! can help out by kicking the tires on this crate and seeing if it works for -//! your use case! Next up you can help us fill out the [vendor -//! intrinsics][vendor] to ensure that we've got all the SIMD support necessary. +//! This crate is intended for eventual inclusion into the standard library, +//! but some work and experimentation is needed to get there! First and +//! foremost you can help out by kicking the tires on this crate and seeing if +//! it works for your use case! Next up you can help us fill out the [vendor +//! intrinsics][vendor] to ensure that we've got all the SIMD support +//! necessary. //! -//! The language support and status of SIMD is also still a little up in the air -//! right now, you may be interested in a few issues along these lines: +//! The language support and status of SIMD is also still a little up in the +//! air right now, you may be interested in a few issues along these lines: //! -//! * [Overal tracking issue for SIMD support](https://github.com/rust-lang/rust/issues/27731) -//! * [`cfg_target_feature` tracking issue](https://github.com/rust-lang/rust/issues/29717) -//! * [SIMD types currently not sound](https://github.com/rust-lang/rust/issues/44367) -//! * [`#[target_feature]` improvements](https://github.com/rust-lang/rust/issues/44839) +//! * [Overal tracking issue for SIMD support] +//! (https://github.com/rust-lang/rust/issues/27731) +//! * [`cfg_target_feature` tracking issue] +//! (https://github.com/rust-lang/rust/issues/29717) +//! * [SIMD types currently not sound] +//! (https://github.com/rust-lang/rust/issues/44367) +//! * [`#[target_feature]` improvements] +//! (https://github.com/rust-lang/rust/issues/44839) //! //! [vendor]: https://github.com/rust-lang-nursery/stdsimd/issues/40 #![cfg_attr(feature = "strict", deny(warnings))] #![allow(dead_code)] #![allow(unused_features)] -#![feature( - const_fn, link_llvm_intrinsics, platform_intrinsics, repr_simd, simd_ffi, - target_feature, cfg_target_feature, i128_type, asm, const_atomic_usize_new -)] +#![feature(const_fn, link_llvm_intrinsics, platform_intrinsics, repr_simd, + simd_ffi, target_feature, cfg_target_feature, i128_type, asm, + const_atomic_usize_new, stmt_expr_attributes)] #![cfg_attr(test, feature(proc_macro, test))] #[cfg(test)] diff --git a/library/stdarch/src/macros.rs b/library/stdarch/src/macros.rs index 43b6a9b2f854..ab287a80000b 100644 --- a/library/stdarch/src/macros.rs +++ b/library/stdarch/src/macros.rs @@ -240,9 +240,11 @@ macro_rules! define_integer_ops { i8, i16, i32, i64, isize); impl ::std::fmt::LowerHex for $ty { - fn fmt(&self, f: &mut ::std::fmt::Formatter) -> ::std::fmt::Result { + fn fmt(&self, f: &mut ::std::fmt::Formatter) + -> ::std::fmt::Result { write!(f, "{}(", stringify!($ty))?; - let n = ::std::mem::size_of_val(self) / ::std::mem::size_of::<$elem>(); + let n = ::std::mem::size_of_val(self) + / ::std::mem::size_of::<$elem>(); for i in 0..n { if i > 0 { write!(f, ", ")?; @@ -292,8 +294,7 @@ macro_rules! cfg_feature_enabled { /// On ARM features are only detected at compile-time using /// cfg(target_feature), so if this macro is executed the /// feature is not supported. -#[cfg(any(target_arch = "arm", - target_arch = "aarch64"))] +#[cfg(any(target_arch = "arm", target_arch = "aarch64"))] #[macro_export] #[doc(hidden)] macro_rules! __unstable_detect_feature { @@ -302,10 +303,8 @@ macro_rules! __unstable_detect_feature { } /// In all unsupported architectures using the macro is an error -#[cfg(not(any(target_arch = "x86", - target_arch = "x86_64", - target_arch = "arm", - target_arch = "aarch64")))] +#[cfg(not(any(target_arch = "x86", target_arch = "x86_64", + target_arch = "arm", target_arch = "aarch64")))] #[macro_export] #[doc(hidden)] macro_rules! __unstable_detect_feature { diff --git a/library/stdarch/src/v128.rs b/library/stdarch/src/v128.rs index 922f983a59ad..f5c425bc745d 100644 --- a/library/stdarch/src/v128.rs +++ b/library/stdarch/src/v128.rs @@ -50,7 +50,17 @@ define_from!(u8x16, u64x2, i64x2, u32x4, i32x4, u16x8, i16x8, i8x16); define_from!(i8x16, u64x2, i64x2, u32x4, i32x4, u16x8, i16x8, u8x16); define_common_ops!( - f64x2, f32x4, u64x2, i64x2, u32x4, i32x4, u16x8, i16x8, u8x16, i8x16); + f64x2, + f32x4, + u64x2, + i64x2, + u32x4, + i32x4, + u16x8, + i16x8, + u8x16, + i8x16 +); define_float_ops!(f64x2, f32x4); define_integer_ops!( (u64x2, u64), @@ -60,7 +70,8 @@ define_integer_ops!( (u16x8, u16), (i16x8, i16), (u8x16, u8), - (i8x16, i8)); + (i8x16, i8) +); define_casts!( (f64x2, f32x2, as_f32x2), (f64x2, u64x2, as_u64x2), @@ -79,4 +90,5 @@ define_casts!( (u16x8, i16x8, as_i16x8), (i16x8, u16x8, as_u16x8), (u8x16, i8x16, as_i8x16), - (i8x16, u8x16, as_u8x16)); + (i8x16, u8x16, as_u8x16) +); diff --git a/library/stdarch/src/v256.rs b/library/stdarch/src/v256.rs index a5e163e45319..33d2584f7127 100644 --- a/library/stdarch/src/v256.rs +++ b/library/stdarch/src/v256.rs @@ -74,7 +74,17 @@ define_from!(u8x32, u64x4, i64x4, u32x8, i32x8, u16x16, i16x16, i8x32); define_from!(i8x32, u64x4, i64x4, u32x8, i32x8, u16x16, i16x16, u8x32); define_common_ops!( - f64x4, f32x8, u64x4, i64x4, u32x8, i32x8, u16x16, i16x16, u8x32, i8x32); + f64x4, + f32x8, + u64x4, + i64x4, + u32x8, + i32x8, + u16x16, + i16x16, + u8x32, + i8x32 +); define_float_ops!(f64x4, f32x8); define_integer_ops!( (u64x4, u64), @@ -84,7 +94,8 @@ define_integer_ops!( (u16x16, u16), (i16x16, i16), (u8x32, u8), - (i8x32, i8)); + (i8x32, i8) +); define_casts!( (f64x4, f32x4, as_f32x4), (f64x4, u64x4, as_u64x4), @@ -102,4 +113,5 @@ define_casts!( (u16x16, i16x16, as_i16x16), (i16x16, u16x16, as_u16x16), (u8x32, i8x32, as_i8x32), - (i8x32, u8x32, as_u8x32)); + (i8x32, u8x32, as_u8x32) +); diff --git a/library/stdarch/src/v512.rs b/library/stdarch/src/v512.rs index 5b1afee639b9..4973a7001ed6 100644 --- a/library/stdarch/src/v512.rs +++ b/library/stdarch/src/v512.rs @@ -120,7 +120,17 @@ define_from!(u8x64, u64x8, i64x8, u32x16, i32x16, u16x32, i16x32, i8x64); define_from!(i8x64, u64x8, i64x8, u32x16, i32x16, u16x32, i16x32, u8x64); define_common_ops!( - f64x8, f32x16, u64x8, i64x8, u32x16, i32x16, u16x32, i16x32, u8x64, i8x64); + f64x8, + f32x16, + u64x8, + i64x8, + u32x16, + i32x16, + u16x32, + i16x32, + u8x64, + i8x64 +); define_float_ops!(f64x8, f32x16); define_integer_ops!( (u64x8, u64), @@ -130,7 +140,8 @@ define_integer_ops!( (u16x32, u16), (i16x32, i16), (u8x64, u8), - (i8x64, i8)); + (i8x64, i8) +); define_casts!( (f64x8, f32x8, as_f32x8), (f64x8, u64x8, as_u64x8), @@ -148,5 +159,5 @@ define_casts!( (u16x32, i16x32, as_i16x32), (i16x32, u16x32, as_u16x32), (u8x64, i8x64, as_i8x64), - (i8x64, u8x64, as_u8x64)); - + (i8x64, u8x64, as_u8x64) +); diff --git a/library/stdarch/src/v64.rs b/library/stdarch/src/v64.rs index a889f037a744..fe7f59c2fde1 100644 --- a/library/stdarch/src/v64.rs +++ b/library/stdarch/src/v64.rs @@ -42,7 +42,8 @@ define_integer_ops!( (u16x4, u16), (i16x4, i16), (u8x8, u8), - (i8x8, i8)); + (i8x8, i8) +); define_casts!( (f32x2, f64x2, as_f64x2), (f32x2, u32x2, as_u32x2), @@ -61,5 +62,4 @@ define_casts!( (u8x8, u16x8, as_u16x8), (u16x4, u32x4, as_u32x4), (u32x2, u64x2, as_u64x2) - ); diff --git a/library/stdarch/src/x86/abm.rs b/library/stdarch/src/x86/abm.rs index 4b9b49e81595..9f2fa9811b1f 100644 --- a/library/stdarch/src/x86/abm.rs +++ b/library/stdarch/src/x86/abm.rs @@ -4,11 +4,19 @@ //! //! The references are: //! -//! - [Intel 64 and IA-32 Architectures Software Developer's Manual Volume 2: Instruction Set Reference, A-Z](http://www.intel.de/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-instruction-set-reference-manual-325383.pdf). -//! - [AMD64 Architecture Programmer's Manual, Volume 3: General-Purpose and System Instructions](http://support.amd.com/TechDocs/24594.pdf). +//! - [Intel 64 and IA-32 Architectures Software Developer's Manual Volume 2: +//! Instruction Set Reference, A-Z][intel64_ref]. +//! - [AMD64 Architecture Programmer's Manual, Volume 3: General-Purpose and +//! System Instructions][amd64_ref]. //! -//! [Wikipedia](https://en.wikipedia.org/wiki/Bit_Manipulation_Instruction_Sets#ABM_.28Advanced_Bit_Manipulation.29) -//! provides a quick overview of the instructions available. +//! [Wikipedia][wikipedia_bmi] provides a quick overview of the instructions +//! available. +//! +//! [intel64_ref]: http://www.intel.de/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-instruction-set-reference-manual-325383.pdf +//! [amd64_ref]: http://support.amd.com/TechDocs/24594.pdf +//! [wikipedia_bmi]: +//! https://en.wikipedia.org/wiki/Bit_Manipulation_Instruction_Sets#ABM_. +//! 28Advanced_Bit_Manipulation.29 #[cfg(test)] use stdsimd_test::assert_instr; @@ -19,7 +27,9 @@ use stdsimd_test::assert_instr; #[inline(always)] #[target_feature = "+lzcnt"] #[cfg_attr(test, assert_instr(lzcnt))] -pub unsafe fn _lzcnt_u32(x: u32) -> u32 { x.leading_zeros() } +pub unsafe fn _lzcnt_u32(x: u32) -> u32 { + x.leading_zeros() +} /// Counts the leading most significant zero bits. /// @@ -27,19 +37,25 @@ pub unsafe fn _lzcnt_u32(x: u32) -> u32 { x.leading_zeros() } #[inline(always)] #[target_feature = "+lzcnt"] #[cfg_attr(test, assert_instr(lzcnt))] -pub unsafe fn _lzcnt_u64(x: u64) -> u64 { x.leading_zeros() as u64 } +pub unsafe fn _lzcnt_u64(x: u64) -> u64 { + x.leading_zeros() as u64 +} /// Counts the bits that are set. #[inline(always)] #[target_feature = "+popcnt"] #[cfg_attr(test, assert_instr(popcnt))] -pub unsafe fn _popcnt32(x: u32) -> u32 { x.count_ones() } +pub unsafe fn _popcnt32(x: u32) -> u32 { + x.count_ones() +} /// Counts the bits that are set. #[inline(always)] #[target_feature = "+popcnt"] #[cfg_attr(test, assert_instr(popcnt))] -pub unsafe fn _popcnt64(x: u64) -> u64 { x.count_ones() as u64 } +pub unsafe fn _popcnt64(x: u64) -> u64 { + x.count_ones() as u64 +} #[cfg(test)] mod tests { diff --git a/library/stdarch/src/x86/avx.rs b/library/stdarch/src/x86/avx.rs index 7e3e0f9dcf60..858935208e4a 100644 --- a/library/stdarch/src/x86/avx.rs +++ b/library/stdarch/src/x86/avx.rs @@ -18,7 +18,8 @@ pub unsafe fn _mm256_add_pd(a: f64x4, b: f64x4) -> f64x4 { a + b } -/// Add packed single-precision (32-bit) floating-point elements in `a` and `b`. +/// Add packed single-precision (32-bit) floating-point elements in `a` and +/// `b`. #[inline(always)] #[target_feature = "+avx"] #[cfg_attr(test, assert_instr(vaddps))] @@ -26,7 +27,8 @@ pub unsafe fn _mm256_add_ps(a: f32x8, b: f32x8) -> f32x8 { a + b } -/// Compute the bitwise AND of a packed double-precision (64-bit) floating-point elements +/// Compute the bitwise AND of a packed double-precision (64-bit) +/// floating-point elements /// in `a` and `b`. #[inline(always)] #[target_feature = "+avx"] @@ -39,7 +41,8 @@ pub unsafe fn _mm256_and_pd(a: f64x4, b: f64x4) -> f64x4 { mem::transmute(a & b) } -/// Compute the bitwise AND of packed single-precision (32-bit) floating-point elements in `a` and `b`. +/// Compute the bitwise AND of packed single-precision (32-bit) floating-point +/// elements in `a` and `b`. #[inline(always)] #[target_feature = "+avx"] #[cfg_attr(test, assert_instr(vandps))] @@ -49,7 +52,8 @@ pub unsafe fn _mm256_and_ps(a: f32x8, b: f32x8) -> f32x8 { mem::transmute(a & b) } -/// Compute the bitwise OR packed double-precision (64-bit) floating-point elements +/// Compute the bitwise OR packed double-precision (64-bit) floating-point +/// elements /// in `a` and `b`. #[inline(always)] #[target_feature = "+avx"] @@ -62,7 +66,8 @@ pub unsafe fn _mm256_or_pd(a: f64x4, b: f64x4) -> f64x4 { mem::transmute(a | b) } -/// Compute the bitwise OR packed single-precision (32-bit) floating-point elements in `a` and `b`. +/// Compute the bitwise OR packed single-precision (32-bit) floating-point +/// elements in `a` and `b`. #[inline(always)] #[target_feature = "+avx"] #[cfg_attr(test, assert_instr(vorps))] @@ -114,7 +119,8 @@ pub unsafe fn _mm256_shuffle_pd(a: f64x4, b: f64x4, imm8: i32) -> f64x4 { } } -/// Compute the bitwise NOT of packed double-precision (64-bit) floating-point elements in `a` +/// Compute the bitwise NOT of packed double-precision (64-bit) floating-point +/// elements in `a` /// and then AND with `b`. #[inline(always)] #[target_feature = "+avx"] @@ -126,7 +132,8 @@ pub unsafe fn _mm256_andnot_pd(a: f64x4, b: f64x4) -> f64x4 { mem::transmute((!a) & b) } -/// Compute the bitwise NOT of packed single-precision (32-bit) floating-point elements in `a` +/// Compute the bitwise NOT of packed single-precision (32-bit) floating-point +/// elements in `a` /// and then AND with `b`. #[inline(always)] #[target_feature = "+avx"] @@ -146,8 +153,8 @@ pub unsafe fn _mm256_max_pd(a: f64x4, b: f64x4) -> f64x4 { maxpd256(a, b) } -/// Compare packed single-precision (32-bit) floating-point elements in `a` and `b`, -/// and return packed maximum values +/// Compare packed single-precision (32-bit) floating-point elements in `a` +/// and `b`, and return packed maximum values #[inline(always)] #[target_feature = "+avx"] #[cfg_attr(test, assert_instr(vmaxps))] @@ -164,8 +171,8 @@ pub unsafe fn _mm256_min_pd(a: f64x4, b: f64x4) -> f64x4 { minpd256(a, b) } -/// Compare packed single-precision (32-bit) floating-point elements in `a` and `b`, -/// and return packed minimum values +/// Compare packed single-precision (32-bit) floating-point elements in `a` +/// and `b`, and return packed minimum values #[inline(always)] #[target_feature = "+avx"] #[cfg_attr(test, assert_instr(vminps))] @@ -182,7 +189,8 @@ pub unsafe fn _mm256_mul_pd(a: f64x4, b: f64x4) -> f64x4 { a * b } -/// Add packed single-precision (32-bit) floating-point elements in `a` and `b`. +/// Add packed single-precision (32-bit) floating-point elements in `a` and +/// `b`. #[inline(always)] #[target_feature = "+avx"] #[cfg_attr(test, assert_instr(vmulps))] @@ -266,8 +274,8 @@ pub unsafe fn _mm256_round_pd(a: f64x4, b: i32) -> f64x4 { constify_imm8!(b, call) } -/// Round packed double-precision (64-bit) floating point elements in `a` toward -/// positive infinity. +/// Round packed double-precision (64-bit) floating point elements in `a` +/// toward positive infinity. #[inline(always)] #[target_feature = "+avx"] #[cfg_attr(test, assert_instr(vroundpd))] @@ -275,8 +283,8 @@ pub unsafe fn _mm256_ceil_pd(a: f64x4) -> f64x4 { roundpd256(a, 0x02) } -/// Round packed double-precision (64-bit) floating point elements in `a` toward -/// negative infinity. +/// Round packed double-precision (64-bit) floating point elements in `a` +/// toward negative infinity. #[inline(always)] #[target_feature = "+avx"] #[cfg_attr(test, assert_instr(vroundpd))] @@ -292,9 +300,9 @@ pub unsafe fn _mm256_floor_pd(a: f64x4) -> f64x4 { /// - `0x02`: Round up, toward positive infinity. /// - `0x03`: Truncate the values. /// -/// For a complete list of options, check the LLVM docs: +/// For a complete list of options, check [the LLVM docs][llvm_docs]. /// -/// https://github.com/llvm-mirror/clang/blob/dcd8d797b20291f1a6b3e0ddda085aa2bbb382a8/lib/Headers/avxintrin.h#L382 +/// [llvm_docs]: https://github.com/llvm-mirror/clang/blob/dcd8d797b20291f1a6b3e0ddda085aa2bbb382a8/lib/Headers/avxintrin.h#L382 #[inline(always)] #[target_feature = "+avx"] #[cfg_attr(test, assert_instr(vroundps, b = 0x00))] @@ -307,8 +315,8 @@ pub unsafe fn _mm256_round_ps(a: f32x8, b: i32) -> f32x8 { constify_imm8!(b, call) } -/// Round packed single-precision (32-bit) floating point elements in `a` toward -/// positive infinity. +/// Round packed single-precision (32-bit) floating point elements in `a` +/// toward positive infinity. #[inline(always)] #[target_feature = "+avx"] #[cfg_attr(test, assert_instr(vroundps))] @@ -316,8 +324,8 @@ pub unsafe fn _mm256_ceil_ps(a: f32x8) -> f32x8 { roundps256(a, 0x02) } -/// Round packed single-precision (32-bit) floating point elements in `a` toward -/// negative infinity. +/// Round packed single-precision (32-bit) floating point elements in `a` +/// toward negative infinity. #[inline(always)] #[target_feature = "+avx"] #[cfg_attr(test, assert_instr(vroundps))] @@ -606,7 +614,8 @@ pub unsafe fn _mm256_cmp_ps(a: f32x8, b: f32x8, imm8: u8) -> f32x8 { /// Compare the lower double-precision (64-bit) floating-point element in /// `a` and `b` based on the comparison operand specified by `imm8`, /// store the result in the lower element of returned vector, -/// and copy the upper element from `a` to the upper element of returned vector. +/// and copy the upper element from `a` to the upper element of returned +/// vector. #[inline(always)] #[target_feature = "+avx,+sse2"] #[cfg_attr(test, assert_instr(vcmpeqsd, imm8 = 0))] // TODO Validate vcmpsd @@ -811,7 +820,9 @@ pub unsafe fn _mm_permutevar_ps(a: f32x4, b: i32x4) -> f32x4 { #[cfg_attr(test, assert_instr(vpermilps, imm8 = 9))] pub unsafe fn _mm256_permute_ps(a: f32x8, imm8: i32) -> f32x8 { let imm8 = (imm8 & 0xFF) as u8; - const fn add4(x: u32) -> u32 { x + 4 } + const fn add4(x: u32) -> u32 { + x + 4 + } macro_rules! shuffle4 { ($a:expr, $b:expr, $c:expr, $d:expr) => { simd_shuffle8(a, _mm256_undefined_ps(), [ @@ -857,7 +868,7 @@ pub unsafe fn _mm256_permute_ps(a: f32x8, imm8: i32) -> f32x8 { } } -/// Shuffle single-precision (32-bit) floating-point elements in `a` +/// Shuffle single-precision (32-bit) floating-point elements in `a` /// using the control in `imm8`. #[inline(always)] #[target_feature = "+avx,+sse"] @@ -1026,7 +1037,9 @@ pub unsafe fn _mm256_permute2f128_pd(a: f64x4, b: f64x4, imm8: i8) -> f64x4 { #[inline(always)] #[target_feature = "+avx"] #[cfg_attr(test, assert_instr(vperm2f128, imm8 = 0x31))] -pub unsafe fn _mm256_permute2f128_si256(a: i32x8, b: i32x8, imm8: i8) -> i32x8 { +pub unsafe fn _mm256_permute2f128_si256( + a: i32x8, b: i32x8, imm8: i8 +) -> i32x8 { macro_rules! call { ($imm8:expr) => { vperm2f128si256(a, b, $imm8) } } @@ -1110,7 +1123,9 @@ pub unsafe fn _mm256_insertf128_pd(a: f64x4, b: f64x2, imm8: i32) -> f64x4 { #[inline(always)] #[target_feature = "+avx"] #[cfg_attr(test, assert_instr(vinsertf128, imm8 = 1))] -pub unsafe fn _mm256_insertf128_si256(a: __m256i, b: __m128i, imm8: i32) -> __m256i { +pub unsafe fn _mm256_insertf128_si256( + a: __m256i, b: __m128i, imm8: i32 +) -> __m256i { let b = i64x4::from(_mm256_castsi128_si256(b)); let dst: i64x4 = match imm8 & 1 { 0 => simd_shuffle4(i64x4::from(a), b, [4, 5, 2, 3]), @@ -1166,7 +1181,8 @@ pub unsafe fn _mm256_loadu_pd(mem_addr: *const f64) -> f64x4 { ptr::copy_nonoverlapping( mem_addr as *const u8, &mut dst as *mut f64x4 as *mut u8, - mem::size_of::()); + mem::size_of::(), + ); dst } @@ -1191,7 +1207,8 @@ pub unsafe fn _mm256_loadu_ps(mem_addr: *const f32) -> f32x8 { ptr::copy_nonoverlapping( mem_addr as *const u8, &mut dst as *mut f32x8 as *mut u8, - mem::size_of::()); + mem::size_of::(), + ); dst } @@ -1215,12 +1232,13 @@ pub unsafe fn _mm256_loadu_si256(mem_addr: *const __m256i) -> __m256i { ptr::copy_nonoverlapping( mem_addr as *const u8, &mut dst as *mut __m256i as *mut u8, - mem::size_of::<__m256i>()); + mem::size_of::<__m256i>(), + ); dst } /// Store 256-bits of integer data from `a` into memory. -/// `mem_addr` does not need to be aligned on any particular boundary. +/// `mem_addr` does not need to be aligned on any particular boundary. #[inline(always)] #[target_feature = "+avx"] #[cfg_attr(test, assert_instr(vmovups))] // FIXME vmovdqu expected @@ -1234,7 +1252,7 @@ pub unsafe fn _mm256_storeu_si256(mem_addr: *mut __m256i, a: __m256i) { #[inline(always)] #[target_feature = "+avx"] #[cfg_attr(test, assert_instr(vmaskmovpd))] -pub unsafe fn _mm256_maskload_pd(mem_addr: *const f64, mask: i64x4) -> f64x4 { +pub unsafe fn _mm256_maskload_pd(mem_addr: *const f64, mask: i64x4) -> f64x4 { maskloadpd256(mem_addr as *const i8, mask) } @@ -1272,7 +1290,7 @@ pub unsafe fn _mm_maskstore_pd(mem_addr: *mut f64, mask: i64x2, a: f64x2) { #[inline(always)] #[target_feature = "+avx"] #[cfg_attr(test, assert_instr(vmaskmovps))] -pub unsafe fn _mm256_maskload_ps(mem_addr: *const f32, mask: i32x8) -> f32x8 { +pub unsafe fn _mm256_maskload_ps(mem_addr: *const f32, mask: i32x8) -> f32x8 { maskloadps256(mem_addr as *const i8, mask) } @@ -1592,7 +1610,8 @@ pub unsafe fn _mm_testnzc_ps(a: f32x4, b: f32x4) -> i32 { } /// Set each bit of the returned mask based on the most significant bit of the -/// corresponding packed double-precision (64-bit) floating-point element in `a`. +/// corresponding packed double-precision (64-bit) floating-point element in +/// `a`. #[inline(always)] #[target_feature = "+avx"] #[cfg_attr(test, assert_instr(vmovmskpd))] @@ -1601,7 +1620,8 @@ pub unsafe fn _mm256_movemask_pd(a: f64x4) -> i32 { } /// Set each bit of the returned mask based on the most significant bit of the -/// corresponding packed single-precision (32-bit) floating-point element in `a`. +/// corresponding packed single-precision (32-bit) floating-point element in +/// `a`. #[inline(always)] #[target_feature = "+avx"] #[cfg_attr(test, assert_instr(vmovmskps))] @@ -1646,8 +1666,9 @@ pub unsafe fn _mm256_set_pd(a: f64, b: f64, c: f64, d: f64) -> f64x4 { /// vector with the supplied values. #[inline(always)] #[target_feature = "+avx"] -pub unsafe fn _mm256_set_ps(a: f32, b: f32, c: f32, d: f32, - e: f32, f: f32, g: f32, h: f32) -> f32x8 { +pub unsafe fn _mm256_set_ps( + a: f32, b: f32, c: f32, d: f32, e: f32, f: f32, g: f32, h: f32 +) -> f32x8 { f32x8::new(h, g, f, e, d, c, b, a) } @@ -1655,44 +1676,45 @@ pub unsafe fn _mm256_set_ps(a: f32, b: f32, c: f32, d: f32, /// reverse order. #[inline(always)] #[target_feature = "+avx"] -pub unsafe fn _mm256_set_epi8(e00: i8, e01: i8, e02: i8, e03: i8, - e04: i8, e05: i8, e06: i8, e07: i8, - e08: i8, e09: i8, e10: i8, e11: i8, - e12: i8, e13: i8, e14: i8, e15: i8, - e16: i8, e17: i8, e18: i8, e19: i8, - e20: i8, e21: i8, e22: i8, e23: i8, - e24: i8, e25: i8, e26: i8, e27: i8, - e28: i8, e29: i8, e30: i8, e31: i8) -> i8x32 { - i8x32::new(e31, e30, e29, e28, - e27, e26, e25, e24, - e23, e22, e21, e20, - e19, e18, e17, e16, - e15, e14, e13, e12, - e11, e10, e09, e08, - e07, e06, e05, e04, - e03, e02, e01, e00) +pub unsafe fn _mm256_set_epi8( + e00: i8, e01: i8, e02: i8, e03: i8, e04: i8, e05: i8, e06: i8, e07: i8, + e08: i8, e09: i8, e10: i8, e11: i8, e12: i8, e13: i8, e14: i8, e15: i8, + e16: i8, e17: i8, e18: i8, e19: i8, e20: i8, e21: i8, e22: i8, e23: i8, + e24: i8, e25: i8, e26: i8, e27: i8, e28: i8, e29: i8, e30: i8, e31: i8, +) -> i8x32 { + #[cfg_attr(rustfmt, rustfmt_skip)] + i8x32::new( + e31, e30, e29, e28, e27, e26, e25, e24, + e23, e22, e21, e20, e19, e18, e17, e16, + e15, e14, e13, e12, e11, e10, e09, e08, + e07, e06, e05, e04, e03, e02, e01, e00, + ) } /// Set packed 16-bit integers in returned vector with the supplied values. #[inline(always)] #[target_feature = "+avx"] -pub unsafe fn _mm256_set_epi16(e00: i16, e01: i16, e02: i16, e03: i16, - e04: i16, e05: i16, e06: i16, e07: i16, - e08: i16, e09: i16, e10: i16, e11: i16, - e12: i16, e13: i16, e14: i16, e15: i16) -> i16x16 { - i16x16::new(e15, e14, e13, e12, - e11, e10, e09, e08, - e07, e06, e05, e04, - e03, e02, e01, e00) +pub unsafe fn _mm256_set_epi16( + e00: i16, e01: i16, e02: i16, e03: i16, e04: i16, e05: i16, e06: i16, + e07: i16, e08: i16, e09: i16, e10: i16, e11: i16, e12: i16, e13: i16, + e14: i16, e15: i16, +) -> i16x16 { + #[cfg_attr(rustfmt, rustfmt_skip)] + i16x16::new( + e15, e14, e13, e12, + e11, e10, e09, e08, + e07, e06, e05, e04, + e03, e02, e01, e00, + ) } /// Set packed 32-bit integers in returned vector with the supplied values. #[inline(always)] #[target_feature = "+avx"] -pub unsafe fn _mm256_set_epi32(e0: i32, e1: i32, e2: i32, e3: i32, - e4: i32, e5: i32, e6: i32, e7: i32) -> i32x8 { - i32x8::new(e7, e6, e5, e4, - e3, e2, e1, e0) +pub unsafe fn _mm256_set_epi32( + e0: i32, e1: i32, e2: i32, e3: i32, e4: i32, e5: i32, e6: i32, e7: i32 +) -> i32x8 { + i32x8::new(e7, e6, e5, e4, e3, e2, e1, e0) } /// Set packed 64-bit integers in returned vector with the supplied values. @@ -1715,8 +1737,9 @@ pub unsafe fn _mm256_setr_pd(a: f64, b: f64, c: f64, d: f64) -> f64x4 { /// vector with the supplied values in reverse order. #[inline(always)] #[target_feature = "+avx"] -pub unsafe fn _mm256_setr_ps(a: f32, b: f32, c: f32, d: f32, - e: f32, f: f32, g: f32, h: f32) -> f32x8 { +pub unsafe fn _mm256_setr_ps( + a: f32, b: f32, c: f32, d: f32, e: f32, f: f32, g: f32, h: f32 +) -> f32x8 { f32x8::new(a, b, c, d, e, f, g, h) } @@ -1724,46 +1747,47 @@ pub unsafe fn _mm256_setr_ps(a: f32, b: f32, c: f32, d: f32, /// reverse order. #[inline(always)] #[target_feature = "+avx"] -pub unsafe fn _mm256_setr_epi8(e00: i8, e01: i8, e02: i8, e03: i8, - e04: i8, e05: i8, e06: i8, e07: i8, - e08: i8, e09: i8, e10: i8, e11: i8, - e12: i8, e13: i8, e14: i8, e15: i8, - e16: i8, e17: i8, e18: i8, e19: i8, - e20: i8, e21: i8, e22: i8, e23: i8, - e24: i8, e25: i8, e26: i8, e27: i8, - e28: i8, e29: i8, e30: i8, e31: i8) -> i8x32 { - i8x32::new(e00, e01, e02, e03, - e04, e05, e06, e07, - e08, e09, e10, e11, - e12, e13, e14, e15, - e16, e17, e18, e19, - e20, e21, e22, e23, - e24, e25, e26, e27, - e28, e29, e30, e31) +pub unsafe fn _mm256_setr_epi8( + e00: i8, e01: i8, e02: i8, e03: i8, e04: i8, e05: i8, e06: i8, e07: i8, + e08: i8, e09: i8, e10: i8, e11: i8, e12: i8, e13: i8, e14: i8, e15: i8, + e16: i8, e17: i8, e18: i8, e19: i8, e20: i8, e21: i8, e22: i8, e23: i8, + e24: i8, e25: i8, e26: i8, e27: i8, e28: i8, e29: i8, e30: i8, e31: i8, +) -> i8x32 { + #[cfg_attr(rustfmt, rustfmt_skip)] + i8x32::new( + e00, e01, e02, e03, e04, e05, e06, e07, + e08, e09, e10, e11, e12, e13, e14, e15, + e16, e17, e18, e19, e20, e21, e22, e23, + e24, e25, e26, e27, e28, e29, e30, e31, + ) } /// Set packed 16-bit integers in returned vector with the supplied values in /// reverse order. #[inline(always)] #[target_feature = "+avx"] -pub unsafe fn _mm256_setr_epi16(e00: i16, e01: i16, e02: i16, e03: i16, - e04: i16, e05: i16, e06: i16, e07: i16, - e08: i16, e09: i16, e10: i16, e11: i16, - e12: i16, e13: i16, e14: i16, e15: i16) -> i16x16 { - i16x16::new(e00, e01, e02, e03, - e04, e05, e06, e07, - e08, e09, e10, e11, - e12, e13, e14, e15) +pub unsafe fn _mm256_setr_epi16( + e00: i16, e01: i16, e02: i16, e03: i16, e04: i16, e05: i16, e06: i16, + e07: i16, e08: i16, e09: i16, e10: i16, e11: i16, e12: i16, e13: i16, + e14: i16, e15: i16, +) -> i16x16 { + #[cfg_attr(rustfmt, rustfmt_skip)] + i16x16::new( + e00, e01, e02, e03, + e04, e05, e06, e07, + e08, e09, e10, e11, + e12, e13, e14, e15, + ) } /// Set packed 32-bit integers in returned vector with the supplied values in /// reverse order. #[inline(always)] #[target_feature = "+avx"] -pub unsafe fn _mm256_setr_epi32(e0: i32, e1: i32, e2: i32, e3: i32, - e4: i32, e5: i32, e6: i32, e7: i32) -> i32x8 { - i32x8::new(e0, e1, e2, e3, - e4, e5, e6, e7) +pub unsafe fn _mm256_setr_epi32( + e0: i32, e1: i32, e2: i32, e3: i32, e4: i32, e5: i32, e6: i32, e7: i32 +) -> i32x8 { + i32x8::new(e0, e1, e2, e3, e4, e5, e6, e7) } /// Set packed 64-bit integers in returned vector with the supplied values in @@ -1798,10 +1822,13 @@ pub unsafe fn _mm256_set1_ps(a: f32) -> f32x8 { #[cfg_attr(test, assert_instr(vpshufb))] #[cfg_attr(test, assert_instr(vinsertf128))] pub unsafe fn _mm256_set1_epi8(a: i8) -> i8x32 { - i8x32::new(a, a, a, a, a, a, a, a, - a, a, a, a, a, a, a, a, - a, a, a, a, a, a, a, a, - a, a, a, a, a, a, a, a) + #[cfg_attr(rustfmt, rustfmt_skip)] + i8x32::new( + a, a, a, a, a, a, a, a, + a, a, a, a, a, a, a, a, + a, a, a, a, a, a, a, a, + a, a, a, a, a, a, a, a, + ) } /// Broadcast 16-bit integer `a` to all all elements of returned vector. @@ -1811,8 +1838,7 @@ pub unsafe fn _mm256_set1_epi8(a: i8) -> i8x32 { //#[cfg_attr(test, assert_instr(vpshufb))] #[cfg_attr(test, assert_instr(vinsertf128))] pub unsafe fn _mm256_set1_epi16(a: i16) -> i16x16 { - i16x16::new(a, a, a, a, a, a, a, a, - a, a, a, a, a, a, a, a) + i16x16::new(a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a) } /// Broadcast 32-bit integer `a` to all elements of returned vector. @@ -1954,7 +1980,7 @@ pub unsafe fn _mm256_zextps128_ps256(a: f32x4) -> f32x8 { #[target_feature = "+avx,+sse2"] pub unsafe fn _mm256_zextsi128_si256(a: __m128i) -> __m256i { use x86::sse2::_mm_setzero_si128; - let b = mem::transmute(_mm_setzero_si128()); + let b = mem::transmute(_mm_setzero_si128()); let dst: i64x4 = simd_shuffle4(i64x2::from(a), b, [0, 1, 2, 3]); __m256i::from(dst) } @@ -2044,22 +2070,28 @@ pub unsafe fn _mm256_setr_m128i(lo: __m128i, hi: __m128i) -> __m256i { } /// Load two 128-bit values (composed of 4 packed single-precision (32-bit) -/// floating-point elements) from memory, and combine them into a 256-bit value. +/// floating-point elements) from memory, and combine them into a 256-bit +/// value. /// `hiaddr` and `loaddr` do not need to be aligned on any particular boundary. #[inline(always)] #[target_feature = "+avx,+sse"] -pub unsafe fn _mm256_loadu2_m128(hiaddr: *const f32, loaddr: *const f32) -> f32x8 { +pub unsafe fn _mm256_loadu2_m128( + hiaddr: *const f32, loaddr: *const f32 +) -> f32x8 { use x86::sse::_mm_loadu_ps; let a = _mm256_castps128_ps256(_mm_loadu_ps(loaddr)); _mm256_insertf128_ps(a, _mm_loadu_ps(hiaddr), 1) } /// Load two 128-bit values (composed of 2 packed double-precision (64-bit) -/// floating-point elements) from memory, and combine them into a 256-bit value. +/// floating-point elements) from memory, and combine them into a 256-bit +/// value. /// `hiaddr` and `loaddr` do not need to be aligned on any particular boundary. #[inline(always)] #[target_feature = "+avx,+sse2"] -pub unsafe fn _mm256_loadu2_m128d(hiaddr: *const f64, loaddr: *const f64) -> f64x4 { +pub unsafe fn _mm256_loadu2_m128d( + hiaddr: *const f64, loaddr: *const f64 +) -> f64x4 { use x86::sse2::_mm_loadu_pd; let a = _mm256_castpd128_pd256(_mm_loadu_pd(loaddr)); _mm256_insertf128_pd(a, _mm_loadu_pd(hiaddr), 1) @@ -2070,7 +2102,9 @@ pub unsafe fn _mm256_loadu2_m128d(hiaddr: *const f64, loaddr: *const f64) -> f64 /// `hiaddr` and `loaddr` do not need to be aligned on any particular boundary. #[inline(always)] #[target_feature = "+avx,+sse2"] -pub unsafe fn _mm256_loadu2_m128i(hiaddr: *const __m128i, loaddr: *const __m128i) -> __m256i { +pub unsafe fn _mm256_loadu2_m128i( + hiaddr: *const __m128i, loaddr: *const __m128i +) -> __m256i { use x86::sse2::_mm_loadu_si128; let a = _mm256_castsi128_si256(_mm_loadu_si128(loaddr)); _mm256_insertf128_si256(a, _mm_loadu_si128(hiaddr), 1) @@ -2082,7 +2116,9 @@ pub unsafe fn _mm256_loadu2_m128i(hiaddr: *const __m128i, loaddr: *const __m128i /// `hiaddr` and `loaddr` do not need to be aligned on any particular boundary. #[inline(always)] #[target_feature = "+avx,+sse"] -pub unsafe fn _mm256_storeu2_m128(hiaddr: *mut f32, loaddr: *mut f32, a: f32x8) { +pub unsafe fn _mm256_storeu2_m128( + hiaddr: *mut f32, loaddr: *mut f32, a: f32x8 +) { use x86::sse::_mm_storeu_ps; let lo = _mm256_castps256_ps128(a); _mm_storeu_ps(loaddr, lo); @@ -2096,7 +2132,9 @@ pub unsafe fn _mm256_storeu2_m128(hiaddr: *mut f32, loaddr: *mut f32, a: f32x8) /// `hiaddr` and `loaddr` do not need to be aligned on any particular boundary. #[inline(always)] #[target_feature = "+avx,+sse2"] -pub unsafe fn _mm256_storeu2_m128d(hiaddr: *mut f64, loaddr: *mut f64, a: f64x4) { +pub unsafe fn _mm256_storeu2_m128d( + hiaddr: *mut f64, loaddr: *mut f64, a: f64x4 +) { use x86::sse2::_mm_storeu_pd; let lo = _mm256_castpd256_pd128(a); _mm_storeu_pd(loaddr, lo); @@ -2109,7 +2147,9 @@ pub unsafe fn _mm256_storeu2_m128d(hiaddr: *mut f64, loaddr: *mut f64, a: f64x4) /// `hiaddr` and `loaddr` do not need to be aligned on any particular boundary. #[inline(always)] #[target_feature = "+avx,+sse2"] -pub unsafe fn _mm256_storeu2_m128i(hiaddr: *mut __m128i, loaddr: *mut __m128i, a: __m256i) { +pub unsafe fn _mm256_storeu2_m128i( + hiaddr: *mut __m128i, loaddr: *mut __m128i, a: __m256i +) { use x86::sse2::_mm_storeu_si128; let lo = _mm256_castsi256_si128(a); _mm_storeu_si128(loaddr, lo); @@ -2265,9 +2305,9 @@ extern "C" { #[cfg(test)] mod tests { use stdsimd_test::simd_test; - use test::black_box; // Used to inhibit constant-folding. + use test::black_box; // Used to inhibit constant-folding. - use v128::{f32x4, f64x2, i8x16, i32x4, i64x2}; + use v128::{f32x4, f64x2, i32x4, i64x2, i8x16}; use v256::*; use x86::avx; use x86::{__m128i, __m256i}; @@ -2428,7 +2468,7 @@ mod tests { let a = f64x4::new(1., 2., 3., 4.); let b = f64x4::new(5., 6., 7., 8.); let r = avx::_mm256_sub_pd(a, b); - let e = f64x4::new(-4.,-4.,-4.,-4.); + let e = f64x4::new(-4., -4., -4., -4.); assert_eq!(r, e); } @@ -2504,7 +2544,7 @@ mod tests { #[simd_test = "avx"] unsafe fn _mm256_sqrt_pd() { let a = f64x4::new(4., 9., 16., 25.); - let r = avx::_mm256_sqrt_pd(a, ); + let r = avx::_mm256_sqrt_pd(a); let e = f64x4::new(2., 3., 4., 5.); assert_eq!(r, e); } @@ -2561,7 +2601,10 @@ mod tests { unsafe fn _mm256_blendv_ps() { let a = f32x8::new(4., 9., 16., 25., 4., 9., 16., 25.); let b = f32x8::new(4., 3., 2., 5., 8., 9., 64., 50.0); - let c = f32x8::new(0., 0., 0., 0., !0 as f32, !0 as f32, !0 as f32, !0 as f32); + #[cfg_attr(rustfmt, rustfmt_skip)] + let c = f32x8::new( + 0., 0., 0., 0., !0 as f32, !0 as f32, !0 as f32, !0 as f32, + ); let r = avx::_mm256_blendv_ps(a, b, c); let e = f32x8::new(4., 9., 16., 25., 8., 9., 64., 50.0); assert_eq!(r, e); @@ -2572,7 +2615,8 @@ mod tests { let a = f32x8::new(4., 9., 16., 25., 4., 9., 16., 25.); let b = f32x8::new(4., 3., 2., 5., 8., 9., 64., 50.0); let r = avx::_mm256_dp_ps(a, b, 0xFF); - let e = f32x8::new(200.0, 200.0, 200.0, 200.0, 2387., 2387., 2387., 2387.); + let e = + f32x8::new(200.0, 200.0, 200.0, 200.0, 2387., 2387., 2387., 2387.); assert_eq!(r, e); } @@ -2801,20 +2845,21 @@ mod tests { #[simd_test = "avx"] unsafe fn _mm256_extract_epi8() { + #[cfg_attr(rustfmt, rustfmt_skip)] let a = i8x32::new( 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, - 25, 26, 27, 28, 29, 30, 31, 32); + 25, 26, 27, 28, 29, 30, 31, 32, + ); let r = avx::_mm256_extract_epi8(a, 0); assert_eq!(r, 1); } #[simd_test = "avx"] unsafe fn _mm256_extract_epi16() { - let a = i16x16::new( - 0, 1, 2, 3, 4, 5, 6, 7, - 8, 9, 10, 11, 12, 13, 14, 15); + let a = + i16x16::new(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); let r = avx::_mm256_extract_epi16(a, 0); assert_eq!(r, 0); } @@ -3004,29 +3049,31 @@ mod tests { #[simd_test = "avx"] unsafe fn _mm256_insert_epi8() { + #[cfg_attr(rustfmt, rustfmt_skip)] let a = i8x32::new( 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, - 25, 26, 27, 28, 29, 30, 31, 32); + 25, 26, 27, 28, 29, 30, 31, 32, + ); let r = avx::_mm256_insert_epi8(a, 0, 31); + #[cfg_attr(rustfmt, rustfmt_skip)] let e = i8x32::new( 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, - 25, 26, 27, 28, 29, 30, 31, 0); + 25, 26, 27, 28, 29, 30, 31, 0, + ); assert_eq!(r, e); } #[simd_test = "avx"] unsafe fn _mm256_insert_epi16() { - let a = i16x16::new( - 0, 1, 2, 3, 4, 5, 6, 7, - 8, 9, 10, 11, 12, 13, 14, 15); + let a = + i16x16::new(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); let r = avx::_mm256_insert_epi16(a, 0, 15); - let e = i16x16::new( - 0, 1, 2, 3, 4, 5, 6, 7, - 8, 9, 10, 11, 12, 13, 14, 0); + let e = + i16x16::new(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 0); assert_eq!(r, e); } @@ -3203,18 +3250,22 @@ mod tests { #[simd_test = "avx"] unsafe fn _mm256_lddqu_si256() { + #[cfg_attr(rustfmt, rustfmt_skip)] let a = i8x32::new( 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, - 25, 26, 27, 28, 29, 30, 31, 32); + 25, 26, 27, 28, 29, 30, 31, 32, + ); let p = &a as *const _; let r = avx::_mm256_lddqu_si256(black_box(p)); + #[cfg_attr(rustfmt, rustfmt_skip)] let e = i8x32::new( 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, - 25, 26, 27, 28, 29, 30, 31, 32); + 25, 26, 27, 28, 29, 30, 31, 32, + ); assert_eq!(r, e); } @@ -3222,8 +3273,11 @@ mod tests { unsafe fn _mm256_rcp_ps() { let a = f32x8::new(1., 2., 3., 4., 5., 6., 7., 8.); let r = avx::_mm256_rcp_ps(a); - let e = f32x8::new(0.99975586, 0.49987793, 0.33325195, 0.24993896, - 0.19995117, 0.16662598, 0.14282227, 0.12496948); + #[cfg_attr(rustfmt, rustfmt_skip)] + let e = f32x8::new( + 0.99975586, 0.49987793, 0.33325195, 0.24993896, + 0.19995117, 0.16662598, 0.14282227, 0.12496948, + ); let rel_err = 0.00048828125; for i in 0..8 { assert_approx_eq!(r.extract(i), e.extract(i), 2. * rel_err); @@ -3234,8 +3288,11 @@ mod tests { unsafe fn _mm256_rsqrt_ps() { let a = f32x8::new(1., 2., 3., 4., 5., 6., 7., 8.); let r = avx::_mm256_rsqrt_ps(a); - let e = f32x8::new(0.99975586, 0.7069092, 0.5772705, 0.49987793, - 0.44714355, 0.40820313, 0.3779297, 0.3534546); + #[cfg_attr(rustfmt, rustfmt_skip)] + let e = f32x8::new( + 0.99975586, 0.7069092, 0.5772705, 0.49987793, + 0.44714355, 0.40820313, 0.3779297, 0.3534546, + ); let rel_err = 0.00048828125; for i in 0..8 { assert_approx_eq!(r.extract(i), e.extract(i), 2. * rel_err); @@ -3478,30 +3535,39 @@ mod tests { #[simd_test = "avx"] unsafe fn _mm256_set_epi8() { + #[cfg_attr(rustfmt, rustfmt_skip)] let r = avx::_mm256_set_epi8( 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, - 25, 26, 27, 28, 29, 30, 31, 32); - assert_eq!(r, i8x32::new(32, 31, 30, 29, 28, 27, 26, 25, - 24, 23, 22, 21, 20, 19, 18, 17, - 16, 15, 14, 13, 12, 11, 10, 9, - 8, 7, 6, 5, 4, 3, 2, 1)); + 25, 26, 27, 28, 29, 30, 31, 32, + ); + #[cfg_attr(rustfmt, rustfmt_skip)] + let e = i8x32::new( + 32, 31, 30, 29, 28, 27, 26, 25, + 24, 23, 22, 21, 20, 19, 18, 17, + 16, 15, 14, 13, 12, 11, 10, 9, + 8, 7, 6, 5, 4, 3, 2, 1 + ); + assert_eq!(r, e); } #[simd_test = "avx"] unsafe fn _mm256_set_epi16() { + #[cfg_attr(rustfmt, rustfmt_skip)] let r = avx::_mm256_set_epi16( 1, 2, 3, 4, 5, 6, 7, 8, - 9, 10, 11, 12, 13, 14, 15, 16); - assert_eq!(r, i16x16::new(16, 15, 14, 13, 12, 11, 10, 9, - 8, 7, 6, 5, 4, 3, 2, 1)); + 9, 10, 11, 12, 13, 14, 15, 16, + ); + assert_eq!( + r, + i16x16::new(16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1) + ); } #[simd_test = "avx"] unsafe fn _mm256_set_epi32() { - let r = avx::_mm256_set_epi32( - 1, 2, 3, 4, 5, 6, 7, 8); + let r = avx::_mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, 8); assert_eq!(r, i32x8::new(8, 7, 6, 5, 4, 3, 2, 1)); } @@ -3525,30 +3591,40 @@ mod tests { #[simd_test = "avx"] unsafe fn _mm256_setr_epi8() { + #[cfg_attr(rustfmt, rustfmt_skip)] let r = avx::_mm256_setr_epi8( 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, - 25, 26, 27, 28, 29, 30, 31, 32); - assert_eq!(r, i8x32::new(1, 2, 3, 4, 5, 6, 7, 8, - 9, 10, 11, 12, 13, 14, 15, 16, - 17, 18, 19, 20, 21, 22, 23, 24, - 25, 26, 27, 28, 29, 30, 31, 32)); + 25, 26, 27, 28, 29, 30, 31, 32, + ); + #[cfg_attr(rustfmt, rustfmt_skip)] + let e = i8x32::new( + 1, 2, 3, 4, 5, 6, 7, 8, + 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, + 25, 26, 27, 28, 29, 30, 31, 32 + ); + + assert_eq!(r, e); } #[simd_test = "avx"] unsafe fn _mm256_setr_epi16() { + #[cfg_attr(rustfmt, rustfmt_skip)] let r = avx::_mm256_setr_epi16( 1, 2, 3, 4, 5, 6, 7, 8, - 9, 10, 11, 12, 13, 14, 15, 16); - assert_eq!(r, i16x16::new(1, 2, 3, 4, 5, 6, 7, 8, - 9, 10, 11, 12, 13, 14, 15, 16)); + 9, 10, 11, 12, 13, 14, 15, 16, + ); + assert_eq!( + r, + i16x16::new(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16) + ); } #[simd_test = "avx"] unsafe fn _mm256_setr_epi32() { - let r = avx::_mm256_setr_epi32( - 1, 2, 3, 4, 5, 6, 7, 8); + let r = avx::_mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8); assert_eq!(r, i32x8::new(1, 2, 3, 4, 5, 6, 7, 8)); } @@ -3614,19 +3690,25 @@ mod tests { unsafe fn _mm256_castps_si256() { let a = f32x8::new(1., 2., 3., 4., 5., 6., 7., 8.); let r = avx::_mm256_castps_si256(a); - let e = i8x32::new(0, 0, -128, 63, 0, 0, 0, 64, - 0, 0, 64, 64, 0, 0, -128, 64, - 0, 0, -96, 64, 0, 0, -64, 64, - 0, 0, -32, 64, 0, 0, 0, 65); + #[cfg_attr(rustfmt, rustfmt_skip)] + let e = i8x32::new( + 0, 0, -128, 63, 0, 0, 0, 64, + 0, 0, 64, 64, 0, 0, -128, 64, + 0, 0, -96, 64, 0, 0, -64, 64, + 0, 0, -32, 64, 0, 0, 0, 65, + ); assert_eq!(r, e); } #[simd_test = "avx"] unsafe fn _mm256_castsi256_ps() { - let a = i8x32::new(0, 0, -128, 63, 0, 0, 0, 64, - 0, 0, 64, 64, 0, 0, -128, 64, - 0, 0, -96, 64, 0, 0, -64, 64, - 0, 0, -32, 64, 0, 0, 0, 65); + #[cfg_attr(rustfmt, rustfmt_skip)] + let a = i8x32::new( + 0, 0, -128, 63, 0, 0, 0, 64, + 0, 0, 64, 64, 0, 0, -128, 64, + 0, 0, -96, 64, 0, 0, -64, 64, + 0, 0, -32, 64, 0, 0, 0, 65, + ); let r = avx::_mm256_castsi256_ps(a); let e = f32x8::new(1., 2., 3., 4., 5., 6., 7., 8.); assert_eq!(r, e); @@ -3711,16 +3793,23 @@ mod tests { #[simd_test = "avx"] unsafe fn _mm256_set_m128i() { - let hi = i8x16::new(17, 18, 19, 20, 21, 22, 23, 24, - 25, 26, 27, 28, 29, 30, 31, 32); - let lo = i8x16::new(1, 2, 3, 4, 5, 6, 7, 8, - 9, 10, 11, 12, 13, 14, 15, 16); + #[cfg_attr(rustfmt, rustfmt_skip)] + let hi = i8x16::new( + 17, 18, 19, 20, + 21, 22, 23, 24, + 25, 26, 27, 28, + 29, 30, 31, 32, + ); + let lo = + i8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); let r = avx::_mm256_set_m128i(hi, lo); + #[cfg_attr(rustfmt, rustfmt_skip)] let e = i8x32::new( 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, - 25, 26, 27, 28, 29, 30, 31, 32); + 25, 26, 27, 28, 29, 30, 31, 32, + ); assert_eq!(r, e); } @@ -3744,16 +3833,21 @@ mod tests { #[simd_test = "avx"] unsafe fn _mm256_setr_m128i() { - let lo = i8x16::new(1, 2, 3, 4, 5, 6, 7, 8, - 9, 10, 11, 12, 13, 14, 15, 16); - let hi = i8x16::new(17, 18, 19, 20, 21, 22, 23, 24, - 25, 26, 27, 28, 29, 30, 31, 32); + let lo = + i8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); + #[cfg_attr(rustfmt, rustfmt_skip)] + let hi = i8x16::new( + 17, 18, 19, 20, 21, 22, 23, 24, + 25, 26, 27, 28, 29, 30, 31, 32, + ); let r = avx::_mm256_setr_m128i(lo, hi); + #[cfg_attr(rustfmt, rustfmt_skip)] let e = i8x32::new( 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, - 25, 26, 27, 28, 29, 30, 31, 32); + 25, 26, 27, 28, 29, 30, 31, 32, + ); assert_eq!(r, e); } @@ -3781,17 +3875,24 @@ mod tests { #[simd_test = "avx"] unsafe fn _mm256_loadu2_m128i() { - let hi = i8x16::new(17, 18, 19, 20, 21, 22, 23, 24, - 25, 26, 27, 28, 29, 30, 31, 32); - let lo = i8x16::new(1, 2, 3, 4, 5, 6, 7, 8, - 9, 10, 11, 12, 13, 14, 15, 16); - let r = avx::_mm256_loadu2_m128i(&hi as *const _ as *const _, - &lo as *const _ as *const _); + #[cfg_attr(rustfmt, rustfmt_skip)] + let hi = i8x16::new( + 17, 18, 19, 20, 21, 22, 23, 24, + 25, 26, 27, 28, 29, 30, 31, 32, + ); + let lo = + i8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); + let r = avx::_mm256_loadu2_m128i( + &hi as *const _ as *const _, + &lo as *const _ as *const _, + ); + #[cfg_attr(rustfmt, rustfmt_skip)] let e = i8x32::new( 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, - 25, 26, 27, 28, 29, 30, 31, 32); + 25, 26, 27, 28, 29, 30, 31, 32, + ); assert_eq!(r, e); } @@ -3801,9 +3902,11 @@ mod tests { let a = f32x8::new(1., 2., 3., 4., 5., 6., 7., 8.); let mut hi = _mm_undefined_ps(); let mut lo = _mm_undefined_ps(); - avx::_mm256_storeu2_m128(&mut hi as *mut _ as *mut f32, - &mut lo as *mut _ as *mut f32, - a); + avx::_mm256_storeu2_m128( + &mut hi as *mut _ as *mut f32, + &mut lo as *mut _ as *mut f32, + a, + ); assert_eq!(hi, f32x4::new(5., 6., 7., 8.)); assert_eq!(lo, f32x4::new(1., 2., 3., 4.)); } @@ -3814,9 +3917,11 @@ mod tests { let a = f64x4::new(1., 2., 3., 4.); let mut hi = _mm_undefined_pd(); let mut lo = _mm_undefined_pd(); - avx::_mm256_storeu2_m128d(&mut hi as *mut _ as *mut f64, - &mut lo as *mut _ as *mut f64, - a); + avx::_mm256_storeu2_m128d( + &mut hi as *mut _ as *mut f64, + &mut lo as *mut _ as *mut f64, + a, + ); assert_eq!(hi, f64x2::new(3., 4.)); assert_eq!(lo, f64x2::new(1., 2.)); } @@ -3824,17 +3929,26 @@ mod tests { #[simd_test = "avx"] unsafe fn _mm256_storeu2_m128i() { use x86::sse2::_mm_undefined_si128; + #[cfg_attr(rustfmt, rustfmt_skip)] let a = i8x32::new( 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, - 25, 26, 27, 28, 29, 30, 31, 32); + 25, 26, 27, 28, 29, 30, 31, 32, + ); let mut hi = _mm_undefined_si128(); let mut lo = _mm_undefined_si128(); avx::_mm256_storeu2_m128i(&mut hi as *mut _, &mut lo as *mut _, a); - assert_eq!(hi, i8x16::new(17, 18, 19, 20, 21, 22, 23, 24, - 25, 26, 27, 28, 29, 30, 31, 32)); - assert_eq!(lo, i8x16::new(1, 2, 3, 4, 5, 6, 7, 8, - 9, 10, 11, 12, 13, 14, 15, 16)); + #[cfg_attr(rustfmt, rustfmt_skip)] + let e = i8x16::new( + 17, 18, 19, 20, 21, 22, 23, 24, + 25, 26, 27, 28, 29, 30, 31, 32 + ); + + assert_eq!(hi, e); + assert_eq!( + lo, + i8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16) + ); } } diff --git a/library/stdarch/src/x86/avx2.rs b/library/stdarch/src/x86/avx2.rs index 194ca8f16305..edbf72064329 100644 --- a/library/stdarch/src/x86/avx2.rs +++ b/library/stdarch/src/x86/avx2.rs @@ -96,8 +96,8 @@ pub unsafe fn _mm256_adds_epu16(a: u16x16, b: u16x16) -> u16x16 { paddusw(a, b) } -/// Concatenate pairs of 16-byte blocks in `a` and `b` into a 32-byte temporary result, -/// shift the result right by `n` bytes, and return the low 16 bytes. +/// Concatenate pairs of 16-byte blocks in `a` and `b` into a 32-byte temporary +/// result, shift the result right by `n` bytes, and return the low 16 bytes. #[inline(always)] #[target_feature = "+avx2"] #[cfg_attr(test, assert_instr(vpalignr, n = 15))] @@ -116,7 +116,9 @@ pub unsafe fn _mm256_alignr_epi8(a: i8x32, b: i8x32, n: i32) -> i8x32 { (a, b, n) }; - const fn add(a: u32, b: u32) -> u32 { a + b } + const fn add(a: u32, b: u32) -> u32 { + a + b + } macro_rules! shuffle { ($shift:expr) => { simd_shuffle32(b, a, [ @@ -140,14 +142,22 @@ pub unsafe fn _mm256_alignr_epi8(a: i8x32, b: i8x32, n: i32) -> i8x32 { } } match n { - 0 => shuffle!(0), 1 => shuffle!(1), - 2 => shuffle!(2), 3 => shuffle!(3), - 4 => shuffle!(4), 5 => shuffle!(5), - 6 => shuffle!(6), 7 => shuffle!(7), - 8 => shuffle!(8), 9 => shuffle!(9), - 10 => shuffle!(10), 11 => shuffle!(11), - 12 => shuffle!(12), 13 => shuffle!(13), - 14 => shuffle!(14), 15 => shuffle!(15), + 0 => shuffle!(0), + 1 => shuffle!(1), + 2 => shuffle!(2), + 3 => shuffle!(3), + 4 => shuffle!(4), + 5 => shuffle!(5), + 6 => shuffle!(6), + 7 => shuffle!(7), + 8 => shuffle!(8), + 9 => shuffle!(9), + 10 => shuffle!(10), + 11 => shuffle!(11), + 12 => shuffle!(12), + 13 => shuffle!(13), + 14 => shuffle!(14), + 15 => shuffle!(15), _ => shuffle!(16), } } @@ -174,7 +184,7 @@ pub unsafe fn _mm256_andnot_si256(a: __m256i, b: __m256i) -> __m256i { #[inline(always)] #[target_feature = "+avx2"] #[cfg_attr(test, assert_instr(vpavgw))] -pub unsafe fn _mm256_avg_epu16 (a: u16x16, b: u16x16) -> u16x16 { +pub unsafe fn _mm256_avg_epu16(a: u16x16, b: u16x16) -> u16x16 { pavgw(a, b) } @@ -182,7 +192,7 @@ pub unsafe fn _mm256_avg_epu16 (a: u16x16, b: u16x16) -> u16x16 { #[inline(always)] #[target_feature = "+avx2"] #[cfg_attr(test, assert_instr(vpavgb))] -pub unsafe fn _mm256_avg_epu8 (a: u8x32, b: u8x32) -> u8x32 { +pub unsafe fn _mm256_avg_epu8(a: u8x32, b: u8x32) -> u8x32 { pavgb(a, b) } @@ -320,8 +330,8 @@ pub unsafe fn _mm256_blend_epi16(a: i16x16, b: i16x16, imm8: i32) -> i16x16 { #[inline(always)] #[target_feature = "+avx2"] #[cfg_attr(test, assert_instr(vpblendvb))] -pub unsafe fn _mm256_blendv_epi8(a:i8x32,b:i8x32,mask:__m256i) -> i8x32 { - pblendvb(a,b,mask) +pub unsafe fn _mm256_blendv_epi8(a: i8x32, b: i8x32, mask: __m256i) -> i8x32 { + pblendvb(a, b, mask) } /// Broadcast the low packed 8-bit integer from `a` to all elements of @@ -628,37 +638,83 @@ pub unsafe fn _mm256_hsubs_epi16(a: i16x16, b: i16x16) -> i16x16 { } -// TODO _mm_i32gather_epi32 (int const* base_addr, __m128i vindex, const int scale) -// TODO _mm_mask_i32gather_epi32 (__m128i src, int const* base_addr, __m128i vindex, __m128i mask, const int scale) -// TODO _mm256_i32gather_epi32 (int const* base_addr, __m256i vindex, const int scale) -// TODO _mm256_mask_i32gather_epi32 (__m256i src, int const* base_addr, __m256i vindex, __m256i mask, const int scale) -// TODO _mm_i32gather_epi64 (__int64 const* base_addr, __m128i vindex, const int scale) -// TODO _mm_mask_i32gather_epi64 (__m128i src, __int64 const* base_addr, __m128i vindex, __m128i mask, const int scale) -// TODO _mm256_i32gather_epi64 (__int64 const* base_addr, __m128i vindex, const int scale) -// TODO _mm256_mask_i32gather_epi64 (__m256i src, __int64 const* base_addr, __m128i vindex, __m256i mask, const int scale) -// TODO _mm_i32gather_pd (double const* base_addr, __m128i vindex, const int scale) -// TODO _mm_mask_i32gather_pd (__m128d src, double const* base_addr, __m128i vindex, __m128d mask, const int scale) -// TODO _mm256_i32gather_pd (double const* base_addr, __m128i vindex, const int scale) -// TODO _mm256_mask_i32gather_pd (__m256d src, double const* base_addr, __m128i vindex, __m256d mask, const int scale) -// TODO _mm_i32gather_ps (float const* base_addr, __m128i vindex, const int scale) -// TODO _mm_mask_i32gather_ps (__m128 src, float const* base_addr, __m128i vindex, __m128 mask, const int scale) -// TODO _mm256_i32gather_ps (float const* base_addr, __m256i vindex, const int scale) -// TODO _mm256_mask_i32gather_ps (__m256 src, float const* base_addr, __m256i vindex, __m256 mask, const int scale) -// TODO _mm_i64gather_epi32 (int const* base_addr, __m128i vindex, const int scale) -// TODO _mm_mask_i64gather_epi32 (__m128i src, int const* base_addr, __m128i vindex, __m128i mask, const int scale) -// TODO _mm256_i64gather_epi32 (int const* base_addr, __m256i vindex, const int scale) -// TODO _mm256_mask_i64gather_epi32 (__m128i src, int const* base_addr, __m256i vindex, __m128i mask, const int scale) -// TODO _mm_i64gather_epi64 (__int64 const* base_addr, __m128i vindex, const int scale) -// TODO _mm_mask_i64gather_epi64 (__m128i src, __int64 const* base_addr, __m128i vindex, __m128i mask, const int scale) -// TODO _mm256_i64gather_epi64 (__int64 const* base_addr, __m256i vindex, const int scale) -// TODO _mm256_mask_i64gather_epi64 (__m256i src, __int64 const* base_addr, __m256i vindex, __m256i mask, const int scale) -// TODO _mm_i64gather_pd (double const* base_addr, __m128i vindex, const int scale) -// TODO _mm_mask_i64gather_pd (__m128d src, double const* base_addr, __m128i vindex, __m128d mask, const int scale) -// TODO _mm256_i64gather_pd (double const* base_addr, __m256i vindex, const int scale) -// TODO _mm256_mask_i64gather_pd (__m256d src, double const* base_addr, __m256i vindex, __m256d mask, const int scale) -// TODO _mm_i64gather_ps (float const* base_addr, __m128i vindex, const int scale) -// TODO _mm_mask_i64gather_ps (__m128 src, float const* base_addr, __m128i vindex, __m128 mask, const int scale) -// TODO _mm256_i64gather_ps (float const* base_addr, __m256i vindex, const int scale) +// TODO _mm_i32gather_epi32 (int const* base_addr, __m128i vindex, +// const int scale) +// TODO _mm_mask_i32gather_epi32 (__m128i src, int const* base_addr, +// __m128i vindex, __m128i mask, +// const int scale) +// TODO _mm256_i32gather_epi32 (int const* base_addr, __m256i vindex, +// const int scale) +// TODO _mm256_mask_i32gather_epi32 (__m256i src, int const* base_addr, +// __m256i vindex, __m256i mask, +// const int scale) +// TODO _mm_i32gather_epi64 (__int64 const* base_addr, __m128i vindex, +// const int scale) +// TODO _mm_mask_i32gather_epi64 (__m128i src, __int64 const* base_addr, +// __m128i vindex, __m128i mask, +// const int scale) +// TODO _mm256_i32gather_epi64 (__int64 const* base_addr, __m128i vindex, +// const int scale) +// TODO _mm256_mask_i32gather_epi64 (__m256i src, __int64 const* base_addr, +// __m128i vindex, __m256i mask, +// const int scale) +// TODO _mm_i32gather_pd (double const* base_addr, __m128i vindex, +// const int scale) +// TODO _mm_mask_i32gather_pd (__m128d src, double const* base_addr, +// __m128i vindex, __m128d mask, +// const int scale) +// TODO _mm256_i32gather_pd (double const* base_addr, __m128i vindex, +// const int scale) +// TODO _mm256_mask_i32gather_pd (__m256d src, double const* base_addr, +// __m128i vindex, __m256d mask, +// const int scale) +// TODO _mm_i32gather_ps (float const* base_addr, __m128i vindex, +// const int scale) +// TODO _mm_mask_i32gather_ps (__m128 src, float const* base_addr, +// __m128i vindex, __m128 mask, +// const int scale) +// TODO _mm256_i32gather_ps (float const* base_addr, __m256i vindex, +// const int scale) +// TODO _mm256_mask_i32gather_ps (__m256 src, float const* base_addr, +// __m256i vindex, __m256 mask, +// const int scale) +// TODO _mm_i64gather_epi32 (int const* base_addr, __m128i vindex, +// const int scale) +// TODO _mm_mask_i64gather_epi32 (__m128i src, int const* base_addr, +// __m128i vindex, __m128i mask, +// const int scale) +// TODO _mm256_i64gather_epi32 (int const* base_addr, __m256i vindex, +// const int scale) +// TODO _mm256_mask_i64gather_epi32 (__m128i src, int const* base_addr, +// __m256i vindex, __m128i mask, +// const int scale) +// TODO _mm_i64gather_epi64 (__int64 const* base_addr, __m128i vindex, +// const int scale) +// TODO _mm_mask_i64gather_epi64 (__m128i src, __int64 const* base_addr, +// __m128i vindex, __m128i mask, +// const int scale) +// TODO _mm256_i64gather_epi64 (__int64 const* base_addr, __m256i vindex, +// const int scale) +// TODO _mm256_mask_i64gather_epi64 (__m256i src, __int64 const* base_addr, +// __m256i vindex, __m256i mask, +// const int scale) +// TODO _mm_i64gather_pd (double const* base_addr, __m128i vindex, +// const int scale) +// TODO _mm_mask_i64gather_pd (__m128d src, double const* base_addr, +// __m128i vindex, __m128d mask, +// const int scale) +// TODO _mm256_i64gather_pd (double const* base_addr, __m256i vindex, +// const int scale) +// TODO _mm256_mask_i64gather_pd (__m256d src, double const* base_addr, +// __m256i vindex, __m256d mask, +// const int scale) +// TODO _mm_i64gather_ps (float const* base_addr, __m128i vindex, +// const int scale) +// TODO _mm_mask_i64gather_ps (__m128 src, float const* base_addr, +// __m128i vindex, __m128 mask, +// const int scale) +// TODO _mm256_i64gather_ps (float const* base_addr, __m256i vindex, +// const int scale) // TODO _mm256_mask_i64gather_ps // TODO _mm256_inserti128_si256 @@ -946,7 +1002,7 @@ pub unsafe fn _mm256_mulhi_epu16(a: u16x16, b: u16x16) -> u16x16 { #[inline(always)] #[target_feature = "+avx2"] #[cfg_attr(test, assert_instr(vpmullw))] -pub unsafe fn _mm256_mullo_epi16(a: i16x16, b:i16x16) -> i16x16 { +pub unsafe fn _mm256_mullo_epi16(a: i16x16, b: i16x16) -> i16x16 { a * b } @@ -957,7 +1013,7 @@ pub unsafe fn _mm256_mullo_epi16(a: i16x16, b:i16x16) -> i16x16 { #[inline(always)] #[target_feature = "+avx2"] #[cfg_attr(test, assert_instr(vpmulld))] -pub unsafe fn _mm256_mullo_epi32(a: i32x8, b:i32x8) -> i32x8 { +pub unsafe fn _mm256_mullo_epi32(a: i32x8, b: i32x8) -> i32x8 { a * b } @@ -968,7 +1024,7 @@ pub unsafe fn _mm256_mullo_epi32(a: i32x8, b:i32x8) -> i32x8 { #[inline(always)] #[target_feature = "+avx2"] #[cfg_attr(test, assert_instr(vpmulhrsw))] -pub unsafe fn _mm256_mulhrs_epi16(a: i16x16, b:i16x16) -> i16x16 { +pub unsafe fn _mm256_mulhrs_epi16(a: i16x16, b: i16x16) -> i16x16 { pmulhrsw(a, b) } @@ -1088,7 +1144,7 @@ pub unsafe fn _mm256_permute4x64_epi64(a: i64x4, imm8: i32) -> i64x4 { #[inline(always)] #[target_feature = "+avx2"] #[cfg_attr(test, assert_instr(vpsadbw))] -pub unsafe fn _mm256_sad_epu8 (a: u8x32, b: u8x32) -> u64x4 { +pub unsafe fn _mm256_sad_epu8(a: u8x32, b: u8x32) -> u64x4 { psadbw(a, b) } @@ -1580,15 +1636,19 @@ pub unsafe fn _mm256_subs_epu8(a: u8x32, b: u8x32) -> u8x32 { /// use stdsimd::simd::i8x32; /// use stdsimd::vendor::_mm256_unpackhi_epi8; /// -/// let a = i8x32::new(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31); -/// let b = i8x32::new(0,-1,-2,-3,-4,-5,-6,-7,-8,-9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31); +/// let a = i8x32::new(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, +/// 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31); +/// let b = i8x32::new(0,-1,-2,-3,-4,-5,-6,-7,-8,-9,-10,-11,-12,-13,-14,-15, +/// -16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31); /// /// let c: i8x32; /// unsafe { /// c = _mm256_unpackhi_epi8(a, b); /// } /// -/// let expected = i8x32::new(8,-8, 9,-9, 10,-10, 11,-11, 12,-12, 13,-13, 14,-14, 15,-15, 24,-24, 25,-25, 26,-26, 27,-27, 28,-28, 29,-29, 30,-30, 31,-31); +/// let expected = i8x32::new(8,-8, 9,-9, 10,-10, 11,-11, 12,-12, 13,-13, +/// 14,-14, 15,-15, 24,-24, 25,-25, 26,-26, 27,-27, 28,-28, 29,-29, 30,-30, +/// 31,-31); /// assert_eq!(c, expected); /// /// # } @@ -1600,7 +1660,13 @@ pub unsafe fn _mm256_subs_epu8(a: u8x32, b: u8x32) -> u8x32 { #[target_feature = "+avx2"] #[cfg_attr(test, assert_instr(vpunpckhbw))] pub unsafe fn _mm256_unpackhi_epi8(a: i8x32, b: i8x32) -> i8x32 { - simd_shuffle32(a, b, [8, 40, 9, 41, 10, 42, 11, 43, 12, 44, 13, 45, 14, 46, 15, 47, 24, 56, 25, 57, 26, 58, 27, 59, 28, 60, 29, 61, 30, 62, 31, 63]) + #[cfg_attr(rustfmt, rustfmt_skip)] + simd_shuffle32(a, b, [ + 8, 40, 9, 41, 10, 42, 11, 43, + 12, 44, 13, 45, 14, 46, 15, 47, + 24, 56, 25, 57, 26, 58, 27, 59, + 28, 60, 29, 61, 30, 62, 31, 63, + ]) } /// Unpack and interleave 8-bit integers from the low half of each @@ -1619,15 +1685,18 @@ pub unsafe fn _mm256_unpackhi_epi8(a: i8x32, b: i8x32) -> i8x32 { /// use stdsimd::simd::i8x32; /// use stdsimd::vendor::_mm256_unpacklo_epi8; /// -/// let a = i8x32::new(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31); -/// let b = i8x32::new(0,-1,-2,-3,-4,-5,-6,-7,-8,-9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31); +/// let a = i8x32::new(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, +/// 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31); +/// let b = i8x32::new(0,-1,-2,-3,-4,-5,-6,-7,-8,-9,-10,-11,-12,-13,-14,-15, +/// -16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31); /// /// let c: i8x32; /// unsafe { /// c = _mm256_unpacklo_epi8(a, b); /// } /// -/// let expected = i8x32::new(0, 0, 1,-1, 2,-2, 3,-3, 4,-4, 5,-5, 6,-6, 7,-7, 16,-16, 17,-17, 18,-18, 19,-19, 20,-20, 21,-21, 22,-22, 23,-23); +/// let expected = i8x32::new(0, 0, 1,-1, 2,-2, 3,-3, 4,-4, 5,-5, 6,-6, 7,-7, +/// 16,-16, 17,-17, 18,-18, 19,-19, 20,-20, 21,-21, 22,-22, 23,-23); /// assert_eq!(c, expected); /// /// # } @@ -1639,7 +1708,13 @@ pub unsafe fn _mm256_unpackhi_epi8(a: i8x32, b: i8x32) -> i8x32 { #[target_feature = "+avx2"] #[cfg_attr(test, assert_instr(vpunpcklbw))] pub unsafe fn _mm256_unpacklo_epi8(a: i8x32, b: i8x32) -> i8x32 { - simd_shuffle32(a, b, [0, 32, 1, 33, 2, 34, 3, 35, 4, 36, 5, 37, 6, 38, 7, 39, 16, 48, 17, 49, 18, 50, 19, 51, 20, 52, 21, 53, 22, 54, 23, 55]) + #[cfg_attr(rustfmt, rustfmt_skip)] + simd_shuffle32(a, b, [ + 0, 32, 1, 33, 2, 34, 3, 35, + 4, 36, 5, 37, 6, 38, 7, 39, + 16, 48, 17, 49, 18, 50, 19, 51, + 20, 52, 21, 53, 22, 54, 23, 55, + ]) } /// Unpack and interleave 16-bit integers from the high half of each @@ -1666,7 +1741,8 @@ pub unsafe fn _mm256_unpacklo_epi8(a: i8x32, b: i8x32) -> i8x32 { /// c = _mm256_unpackhi_epi16(a, b); /// } /// -/// let expected = i16x16::new(4,-4, 5,-5, 6,-6, 7,-7, 12,-12, 13,-13, 14,-14, 15,-15); +/// let expected = i16x16::new(4,-4, 5,-5, 6,-6, 7,-7, 12,-12, 13,-13, 14,-14, +/// 15,-15); /// assert_eq!(c, expected); /// /// # } @@ -1678,7 +1754,11 @@ pub unsafe fn _mm256_unpacklo_epi8(a: i8x32, b: i8x32) -> i8x32 { #[target_feature = "+avx2"] #[cfg_attr(test, assert_instr(vpunpckhwd))] pub unsafe fn _mm256_unpackhi_epi16(a: i16x16, b: i16x16) -> i16x16 { - simd_shuffle16(a, b, [4, 20, 5, 21, 6, 22, 7, 23, 12, 28, 13, 29, 14, 30, 15, 31]) + simd_shuffle16( + a, + b, + [4, 20, 5, 21, 6, 22, 7, 23, 12, 28, 13, 29, 14, 30, 15, 31], + ) } /// Unpack and interleave 16-bit integers from the low half of each @@ -1705,7 +1785,8 @@ pub unsafe fn _mm256_unpackhi_epi16(a: i16x16, b: i16x16) -> i16x16 { /// c = _mm256_unpacklo_epi16(a, b); /// } /// -/// let expected = i16x16::new(0, 0, 1,-1, 2,-2, 3,-3, 8,-8, 9,-9, 10,-10, 11,-11); +/// let expected = i16x16::new(0, 0, 1,-1, 2,-2, 3,-3, 8,-8, 9,-9, 10,-10, +/// 11,-11); /// assert_eq!(c, expected); /// /// # } @@ -1717,7 +1798,11 @@ pub unsafe fn _mm256_unpackhi_epi16(a: i16x16, b: i16x16) -> i16x16 { #[target_feature = "+avx2"] #[cfg_attr(test, assert_instr(vpunpcklwd))] pub unsafe fn _mm256_unpacklo_epi16(a: i16x16, b: i16x16) -> i16x16 { - simd_shuffle16(a, b, [0, 16, 1, 17, 2, 18, 3, 19, 8, 24, 9, 25, 10, 26, 11, 27]) + simd_shuffle16( + a, + b, + [0, 16, 1, 17, 2, 18, 3, 19, 8, 24, 9, 25, 10, 26, 11, 27], + ) } /// Unpack and interleave 32-bit integers from the high half of each @@ -1972,9 +2057,9 @@ extern "C" { #[link_name = "llvm.x86.avx2.pmulh.w"] fn pmulhw(a: i16x16, b: i16x16) -> i16x16; #[link_name = "llvm.x86.avx2.pmul.dq"] - fn pmuldq(a: i32x8, b:i32x8) -> i64x4; + fn pmuldq(a: i32x8, b: i32x8) -> i64x4; #[link_name = "llvm.x86.avx2.pmulu.dq"] - fn pmuludq(a: u32x8, b:u32x8) -> u64x4; + fn pmuludq(a: u32x8, b: u32x8) -> u64x4; #[link_name = "llvm.x86.avx2.pmul.hr.sw"] fn pmulhrsw(a: i16x16, b: i16x16) -> i16x16; #[link_name = "llvm.x86.avx2.packsswb"] @@ -2006,17 +2091,17 @@ extern "C" { #[link_name = "llvm.x86.avx2.pslli.q"] fn pslliq(a: i64x4, imm8: i32) -> i64x4; #[link_name = "llvm.x86.avx2.psllv.d"] - fn psllvd(a:i32x4, count:i32x4) -> i32x4; + fn psllvd(a: i32x4, count: i32x4) -> i32x4; #[link_name = "llvm.x86.avx2.psllv.d.256"] - fn psllvd256(a:i32x8, count:i32x8) -> i32x8; + fn psllvd256(a: i32x8, count: i32x8) -> i32x8; #[link_name = "llvm.x86.avx2.psllv.q"] - fn psllvq(a:i64x2, count:i64x2) -> i64x2; + fn psllvq(a: i64x2, count: i64x2) -> i64x2; #[link_name = "llvm.x86.avx2.psllv.q.256"] - fn psllvq256(a:i64x4, count:i64x4) -> i64x4; + fn psllvq256(a: i64x4, count: i64x4) -> i64x4; #[link_name = "llvm.x86.avx2.psra.w"] - fn psraw(a: i16x16, count:i16x8) -> i16x16; + fn psraw(a: i16x16, count: i16x8) -> i16x16; #[link_name = "llvm.x86.avx2.psra.d"] - fn psrad(a: i32x8, count:i32x4) -> i32x8; + fn psrad(a: i32x8, count: i32x4) -> i32x8; #[link_name = "llvm.x86.avx2.psrai.w"] fn psraiw(a: i16x16, imm8: i32) -> i16x16; #[link_name = "llvm.x86.avx2.psrai.d"] @@ -2026,11 +2111,11 @@ extern "C" { #[link_name = "llvm.x86.avx2.psrav.d.256"] fn psravd256(a: i32x8, count: i32x8) -> i32x8; #[link_name = "llvm.x86.avx2.psrl.w"] - fn psrlw(a: i16x16, count:i16x8) -> i16x16; + fn psrlw(a: i16x16, count: i16x8) -> i16x16; #[link_name = "llvm.x86.avx2.psrl.d"] - fn psrld(a: i32x8, count:i32x4) -> i32x8; + fn psrld(a: i32x8, count: i32x4) -> i32x8; #[link_name = "llvm.x86.avx2.psrl.q"] - fn psrlq(a: i64x4, count:i64x2) -> i64x4; + fn psrlq(a: i64x4, count: i64x2) -> i64x4; #[link_name = "llvm.x86.avx2.psrli.w"] fn psrliw(a: i16x16, imm8: i32) -> i16x16; #[link_name = "llvm.x86.avx2.psrli.d"] @@ -2071,49 +2156,53 @@ mod tests { #[simd_test = "avx2"] unsafe fn _mm256_abs_epi32() { + #[cfg_attr(rustfmt, rustfmt_skip)] let a = i32x8::new( 0, 1, -1, std::i32::MAX, - std::i32::MIN + 1, 100, -100, -32); + std::i32::MIN + 1, 100, -100, -32, + ); let r = avx2::_mm256_abs_epi32(a); + #[cfg_attr(rustfmt, rustfmt_skip)] let e = i32x8::new( 0, 1, 1, std::i32::MAX, - (std::i32::MIN + 1).abs(), 100, 100, 32); + (std::i32::MIN + 1).abs(), 100, 100, 32, + ); assert_eq!(r, e); } #[simd_test = "avx2"] unsafe fn _mm256_abs_epi16() { + #[cfg_attr(rustfmt, rustfmt_skip)] let a = i16x16::new( - 0, 1, -1, 2, - -2, 3, -3, 4, - -4, 5, -5, std::i16::MAX, - std::i16::MIN + 1, 100, -100, -32); + 0, 1, -1, 2, -2, 3, -3, 4, + -4, 5, -5, std::i16::MAX, std::i16::MIN + 1, 100, -100, -32, + ); let r = avx2::_mm256_abs_epi16(a); + #[cfg_attr(rustfmt, rustfmt_skip)] let e = i16x16::new( - 0, 1, 1, 2, - 2, 3, 3, 4, - 4, 5, 5, std::i16::MAX, - (std::i16::MIN + 1).abs(), 100, 100, 32); + 0, 1, 1, 2, 2, 3, 3, 4, + 4, 5, 5, std::i16::MAX, (std::i16::MIN + 1).abs(), 100, 100, 32, + ); assert_eq!(r, e); } #[simd_test = "avx2"] unsafe fn _mm256_abs_epi8() { + #[cfg_attr(rustfmt, rustfmt_skip)] let a = i8x32::new( - 0, 1, -1, 2, - -2, 3, -3, 4, - -4, 5, -5, std::i8::MAX, - std::i8::MIN + 1, 100, -100, -32, - 0, 1, -1, 2, - -2, 3, -3, 4, - -4, 5, -5, std::i8::MAX, - std::i8::MIN + 1, 100, -100, -32); + 0, 1, -1, 2, -2, 3, -3, 4, + -4, 5, -5, std::i8::MAX, std::i8::MIN + 1, 100, -100, -32, + 0, 1, -1, 2, -2, 3, -3, 4, + -4, 5, -5, std::i8::MAX, std::i8::MIN + 1, 100, -100, -32, + ); let r = avx2::_mm256_abs_epi8(a); + #[cfg_attr(rustfmt, rustfmt_skip)] let e = i8x32::new( 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, std::i8::MAX, (std::i8::MIN + 1).abs(), 100, 100, 32, 0, 1, 1, 2, 2, 3, 3, 4, - 4, 5, 5, std::i8::MAX, (std::i8::MIN + 1).abs(), 100, 100, 32); + 4, 5, 5, std::i8::MAX, (std::i8::MIN + 1).abs(), 100, 100, 32, + ); assert_eq!(r, e); } @@ -2137,52 +2226,70 @@ mod tests { #[simd_test = "avx2"] unsafe fn _mm256_add_epi16() { - let a = i16x16::new( - 0, 1, 2, 3, 4, 5, 6, 7, - 8, 9, 10, 11, 12, 13, 14, 15); - let b = i16x16::new( - 0, 1, 2, 3, 4, 5, 6, 7, - 8, 9, 10, 11, 12, 13, 14, 15); + let a = + i16x16::new(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let b = + i16x16::new(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); let r = avx2::_mm256_add_epi16(a, b); + #[cfg_attr(rustfmt, rustfmt_skip)] let e = i16x16::new( 0, 2, 4, 6, 8, 10, 12, 14, - 16, 18, 20, 22, 24, 26, 28, 30); + 16, 18, 20, 22, 24, 26, 28, 30, + ); assert_eq!(r, e); } #[simd_test = "avx2"] unsafe fn _mm256_add_epi8() { + #[cfg_attr(rustfmt, rustfmt_skip)] let a = i8x32::new( 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, - 24, 25, 26, 27, 28, 29, 30, 31); + 24, 25, 26, 27, 28, 29, 30, 31, + ); + #[cfg_attr(rustfmt, rustfmt_skip)] let b = i8x32::new( 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, - 24, 25, 26, 27, 28, 29, 30, 31); + 24, 25, 26, 27, 28, 29, 30, 31, + ); let r = avx2::_mm256_add_epi8(a, b); + #[cfg_attr(rustfmt, rustfmt_skip)] let e = i8x32::new( - 0, 2, 4, 6, 8, 10, 12, 14, 16, - 18, 20, 22, 24, 26, 28, 30, 32, - 34, 36, 38, 40, 42, 44, 46, 48, - 50, 52, 54, 56, 58, 60, 62); + 0, 2, 4, 6, 8, 10, 12, 14, + 16, 18, 20, 22, 24, 26, 28, 30, + 32, 34, 36, 38, 40, 42, 44, 46, + 48, 50, 52, 54, 56, 58, 60, 62, + ); assert_eq!(r, e); } #[simd_test = "avx2"] unsafe fn _mm256_adds_epi8() { + #[cfg_attr(rustfmt, rustfmt_skip)] let a = i8x32::new( - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, - 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31); + 0, 1, 2, 3, 4, 5, 6, 7, + 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, + 24, 25, 26, 27, 28, 29, 30, 31, + ); + #[cfg_attr(rustfmt, rustfmt_skip)] let b = i8x32::new( - 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, - 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63); + 32, 33, 34, 35, 36, 37, 38, 39, + 40, 41, 42, 43, 44, 45, 46, 47, + 48, 49, 50, 51, 52, 53, 54, 55, + 56, 57, 58, 59, 60, 61, 62, 63, + ); let r = avx2::_mm256_adds_epi8(a, b); + #[cfg_attr(rustfmt, rustfmt_skip)] let e = i8x32::new( - 32, 34, 36, 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62, - 64, 66, 68, 70, 72, 74, 76, 78, 80, 82, 84, 86, 88, 90, 92, 94); + 32, 34, 36, 38, 40, 42, 44, 46, + 48, 50, 52, 54, 56, 58, 60, 62, + 64, 66, 68, 70, 72, 74, 76, 78, + 80, 82, 84, 86, 88, 90, 92, 94, + ); assert_eq!(r, e); } @@ -2204,13 +2311,19 @@ mod tests { #[simd_test = "avx2"] unsafe fn _mm256_adds_epi16() { - let a = i16x16::new( - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let a = + i16x16::new(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + #[cfg_attr(rustfmt, rustfmt_skip)] let b = i16x16::new( - 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47); - let r = avx2::_mm256_adds_epi16(a, b); + 32, 33, 34, 35, 36, 37, 38, 39, + 40, 41, 42, 43, 44, 45, 46, 47, + ); + let r = avx2::_mm256_adds_epi16(a, b); + #[cfg_attr(rustfmt, rustfmt_skip)] let e = i16x16::new( - 32, 34, 36, 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62); + 32, 34, 36, 38, 40, 42, 44, 46, + 48, 50, 52, 54, 56, 58, 60, 62, + ); assert_eq!(r, e); } @@ -2233,16 +2346,28 @@ mod tests { #[simd_test = "avx2"] unsafe fn _mm256_adds_epu8() { + #[cfg_attr(rustfmt, rustfmt_skip)] let a = u8x32::new( - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, - 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31); + 0, 1, 2, 3, 4, 5, 6, 7, + 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, + 24, 25, 26, 27, 28, 29, 30, 31, + ); + #[cfg_attr(rustfmt, rustfmt_skip)] let b = u8x32::new( - 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, - 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63); + 32, 33, 34, 35, 36, 37, 38, 39, + 40, 41, 42, 43, 44, 45, 46, 47, + 48, 49, 50, 51, 52, 53, 54, 55, + 56, 57, 58, 59, 60, 61, 62, 63, + ); let r = avx2::_mm256_adds_epu8(a, b); + #[cfg_attr(rustfmt, rustfmt_skip)] let e = u8x32::new( - 32, 34, 36, 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62, - 64, 66, 68, 70, 72, 74, 76, 78, 80, 82, 84, 86, 88, 90, 92, 94); + 32, 34, 36, 38, 40, 42, 44, 46, + 48, 50, 52, 54, 56, 58, 60, 62, + 64, 66, 68, 70, 72, 74, 76, 78, + 80, 82, 84, 86, 88, 90, 92, 94, + ); assert_eq!(r, e); } @@ -2257,13 +2382,19 @@ mod tests { #[simd_test = "avx2"] unsafe fn _mm256_adds_epu16() { - let a = u16x16::new( - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let a = + u16x16::new(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + #[cfg_attr(rustfmt, rustfmt_skip)] let b = u16x16::new( - 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47); + 32, 33, 34, 35, 36, 37, 38, 39, + 40, 41, 42, 43, 44, 45, 46, 47, + ); let r = avx2::_mm256_adds_epu16(a, b); + #[cfg_attr(rustfmt, rustfmt_skip)] let e = u16x16::new( - 32, 34, 36, 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62); + 32, 34, 36, 38, 40, 42, 44, 46, + 48, 50, 52, 54, 56, 58, 60, 62, + ); assert_eq!(r, e); } @@ -2346,11 +2477,11 @@ mod tests { #[simd_test = "avx2"] unsafe fn _mm256_blendv_epi8() { - let (a,b) = (i8x32::splat(4),i8x32::splat(2)); - let mask = i8x32::splat(0).replace(2,-1); - let e = i8x32::splat(4).replace(2,2); - let r= avx2::_mm256_blendv_epi8(a,b,mask); - assert_eq!(r,e); + let (a, b) = (i8x32::splat(4), i8x32::splat(2)); + let mask = i8x32::splat(0).replace(2, -1); + let e = i8x32::splat(4).replace(2, 2); + let r = avx2::_mm256_blendv_epi8(a, b, mask); + assert_eq!(r, e); } #[simd_test = "avx2"] @@ -2413,8 +2544,12 @@ mod tests { unsafe fn _mm256_broadcastsi128_si256() { let a = i64x2::new(0x0987654321012334, 0x5678909876543210); let res = avx2::_mm256_broadcastsi128_si256(a); - let retval = i64x4::new(0x0987654321012334, 0x5678909876543210, - 0x0987654321012334, 0x5678909876543210); + let retval = i64x4::new( + 0x0987654321012334, + 0x5678909876543210, + 0x0987654321012334, + 0x5678909876543210, + ); assert_eq!(res, retval); } @@ -2448,30 +2583,38 @@ mod tests { #[simd_test = "avx2"] unsafe fn _mm256_cmpeq_epi8() { + #[cfg_attr(rustfmt, rustfmt_skip)] let a = i8x32::new( - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, - 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31); + 0, 1, 2, 3, 4, 5, 6, 7, + 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, + 24, 25, 26, 27, 28, 29, 30, 31, + ); + #[cfg_attr(rustfmt, rustfmt_skip)] let b = i8x32::new( - 31, 30, 2, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, - 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); + 31, 30, 2, 28, 27, 26, 25, 24, + 23, 22, 21, 20, 19, 18, 17, 16, + 15, 14, 13, 12, 11, 10, 9, 8, + 7, 6, 5, 4, 3, 2, 1, 0, + ); let r = avx2::_mm256_cmpeq_epi8(a, b); - assert_eq!(r, i8x32::splat(0).replace(2,0xFFu8 as i8)); + assert_eq!(r, i8x32::splat(0).replace(2, 0xFFu8 as i8)); } #[simd_test = "avx2"] unsafe fn _mm256_cmpeq_epi16() { - let a = i16x16::new( - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); - let b = i16x16::new( - 15, 14, 2, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); + let a = + i16x16::new(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let b = + i16x16::new(15, 14, 2, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); let r = avx2::_mm256_cmpeq_epi16(a, b); assert_eq!(r, i16x16::splat(0).replace(2, 0xFFFFu16 as i16)); } #[simd_test = "avx2"] unsafe fn _mm256_cmpeq_epi32() { - let a = i32x8::new(0, 1, 2, 3,4,5,6,7); - let b = i32x8::new(7,6,2,4,3, 2, 1, 0); + let a = i32x8::new(0, 1, 2, 3, 4, 5, 6, 7); + let b = i32x8::new(7, 6, 2, 4, 3, 2, 1, 0); let r = avx2::_mm256_cmpeq_epi32(a, b); assert_eq!(r, i32x8::splat(0).replace(2, 0xFFFFFFFFu32 as i32)); } @@ -2481,8 +2624,10 @@ mod tests { let a = i64x4::new(0, 1, 2, 3); let b = i64x4::new(3, 2, 2, 0); let r = avx2::_mm256_cmpeq_epi64(a, b); - assert_eq!(r, i64x4::splat(0).replace( - 2, 0xFFFFFFFFFFFFFFFFu64 as i64)); + assert_eq!( + r, + i64x4::splat(0).replace(2, 0xFFFFFFFFFFFFFFFFu64 as i64) + ); } #[simd_test = "avx2"] @@ -2514,27 +2659,33 @@ mod tests { let a = i64x4::splat(0).replace(0, 5); let b = i64x4::splat(0); let r = avx2::_mm256_cmpgt_epi64(a, b); - assert_eq!(r, i64x4::splat(0).replace( - 0, 0xFFFFFFFFFFFFFFFFu64 as i64)); + assert_eq!( + r, + i64x4::splat(0).replace(0, 0xFFFFFFFFFFFFFFFFu64 as i64) + ); } #[simd_test = "avx2"] unsafe fn _mm256_cvtepi8_epi16() { - let a = i8x16::new(0, 0, -1, 1, -2, 2, -3, 3, -4, 4, -5, 5, -6, 6, -7, 7); - let r = i16x16::new(0, 0, -1, 1, -2, 2, -3, 3, -4, 4, -5, 5, -6, 6, -7, 7); + let a = + i8x16::new(0, 0, -1, 1, -2, 2, -3, 3, -4, 4, -5, 5, -6, 6, -7, 7); + let r = + i16x16::new(0, 0, -1, 1, -2, 2, -3, 3, -4, 4, -5, 5, -6, 6, -7, 7); assert_eq!(r, avx2::_mm256_cvtepi8_epi16(a)); } #[simd_test = "avx2"] unsafe fn _mm256_cvtepi8_epi32() { - let a = i8x16::new(0, 0, -1, 1, -2, 2, -3, 3, -4, 4, -5, 5, -6, 6, -7, 7); + let a = + i8x16::new(0, 0, -1, 1, -2, 2, -3, 3, -4, 4, -5, 5, -6, 6, -7, 7); let r = i32x8::new(0, 0, -1, 1, -2, 2, -3, 3); assert_eq!(r, avx2::_mm256_cvtepi8_epi32(a)); } #[simd_test = "avx2"] unsafe fn _mm256_cvtepi8_epi64() { - let a = i8x16::new(0, 0, -1, 1, -2, 2, -3, 3, -4, 4, -5, 5, -6, 6, -7, 7); + let a = + i8x16::new(0, 0, -1, 1, -2, 2, -3, 3, -4, 4, -5, 5, -6, 6, -7, 7); let r = i64x4::new(0, 0, -1, 1); assert_eq!(r, avx2::_mm256_cvtepi8_epi64(a)); } @@ -2580,11 +2731,11 @@ mod tests { #[simd_test = "avx2"] unsafe fn _mm256_hadds_epi16() { - let a = i16x16::splat(2).replace(0,0x7FFF).replace(1,1); + let a = i16x16::splat(2).replace(0, 0x7FFF).replace(1, 1); let b = i16x16::splat(4); let r = avx2::_mm256_hadds_epi16(a, b); - let e = i16x16::new( - 0x7FFF, 4, 4, 4, 8, 8, 8, 8, 4, 4, 4, 4, 8, 8, 8, 8); + let e = + i16x16::new(0x7FFF, 4, 4, 4, 8, 8, 8, 8, 4, 4, 4, 4, 8, 8, 8, 8); assert_eq!(r, e); } @@ -2608,10 +2759,10 @@ mod tests { #[simd_test = "avx2"] unsafe fn _mm256_hsubs_epi16() { - let a = i16x16::splat(2).replace(0,0x7FFF).replace(1,-1); + let a = i16x16::splat(2).replace(0, 0x7FFF).replace(1, -1); let b = i16x16::splat(4); let r = avx2::_mm256_hsubs_epi16(a, b); - let e = i16x16::splat(0).replace(0,0x7FFF); + let e = i16x16::splat(0).replace(0, 0x7FFF); assert_eq!(r, e); } @@ -2902,11 +3053,13 @@ mod tests { let a = i16x16::splat(2); let b = i16x16::splat(4); let r = avx2::_mm256_packs_epi16(a, b); + #[cfg_attr(rustfmt, rustfmt_skip)] let e = i8x32::new( 2, 2, 2, 2, 2, 2, 2, 2, 4, 4, 4, 4, 4, 4, 4, 4, 2, 2, 2, 2, 2, 2, 2, 2, - 4, 4, 4, 4, 4, 4, 4, 4); + 4, 4, 4, 4, 4, 4, 4, 4, + ); assert_eq!(r, e); } @@ -2916,11 +3069,7 @@ mod tests { let a = i32x8::splat(2); let b = i32x8::splat(4); let r = avx2::_mm256_packs_epi32(a, b); - let e = i16x16::new( - 2, 2, 2, 2, - 4, 4, 4, 4, - 2, 2, 2, 2, - 4, 4, 4, 4); + let e = i16x16::new(2, 2, 2, 2, 4, 4, 4, 4, 2, 2, 2, 2, 4, 4, 4, 4); assert_eq!(r, e); } @@ -2930,11 +3079,13 @@ mod tests { let a = i16x16::splat(2); let b = i16x16::splat(4); let r = avx2::_mm256_packus_epi16(a, b); + #[cfg_attr(rustfmt, rustfmt_skip)] let e = u8x32::new( 2, 2, 2, 2, 2, 2, 2, 2, 4, 4, 4, 4, 4, 4, 4, 4, 2, 2, 2, 2, 2, 2, 2, 2, - 4, 4, 4, 4, 4, 4, 4, 4); + 4, 4, 4, 4, 4, 4, 4, 4, + ); assert_eq!(r, e); } @@ -2944,11 +3095,7 @@ mod tests { let a = i32x8::splat(2); let b = i32x8::splat(4); let r = avx2::_mm256_packus_epi32(a, b); - let e = u16x16::new( - 2, 2, 2, 2, - 4, 4, 4, 4, - 2, 2, 2, 2, - 4, 4, 4, 4); + let e = u16x16::new(2, 2, 2, 2, 4, 4, 4, 4, 2, 2, 2, 2, 4, 4, 4, 4); assert_eq!(r, e); } @@ -3017,21 +3164,24 @@ mod tests { unsafe fn _mm256_slli_epi16() { assert_eq!( avx2::_mm256_slli_epi16(i16x16::splat(0xFF), 4), - i16x16::splat(0xFF0)); + i16x16::splat(0xFF0) + ); } #[simd_test = "avx2"] unsafe fn _mm256_slli_epi32() { assert_eq!( avx2::_mm256_slli_epi32(i32x8::splat(0xFFFF), 4), - i32x8::splat(0xFFFF0)); + i32x8::splat(0xFFFF0) + ); } #[simd_test = "avx2"] unsafe fn _mm256_slli_epi64() { assert_eq!( avx2::_mm256_slli_epi64(i64x4::splat(0xFFFFFFFF), 4), - i64x4::splat(0xFFFFFFFF0)); + i64x4::splat(0xFFFFFFFF0) + ); } #[simd_test = "avx2"] @@ -3090,14 +3240,16 @@ mod tests { unsafe fn _mm256_srai_epi16() { assert_eq!( avx2::_mm256_srai_epi16(i16x16::splat(-1), 1), - i16x16::splat(-1)); + i16x16::splat(-1) + ); } #[simd_test = "avx2"] unsafe fn _mm256_srai_epi32() { assert_eq!( avx2::_mm256_srai_epi32(i32x8::splat(-1), 1), - i32x8::splat(-1)); + i32x8::splat(-1) + ); } #[simd_test = "avx2"] @@ -3106,7 +3258,7 @@ mod tests { let count = i32x4::splat(1); let r = avx2::_mm_srav_epi32(a, count); let e = i32x4::splat(2); - assert_eq!(r, e ); + assert_eq!(r, e); } #[simd_test = "avx2"] @@ -3115,7 +3267,7 @@ mod tests { let count = i32x8::splat(1); let r = avx2::_mm256_srav_epi32(a, count); let e = i32x8::splat(2); - assert_eq!(r, e ); + assert_eq!(r, e); } #[simd_test = "avx2"] @@ -3146,21 +3298,24 @@ mod tests { unsafe fn _mm256_srli_epi16() { assert_eq!( avx2::_mm256_srli_epi16(i16x16::splat(0xFF), 4), - i16x16::splat(0xF)); + i16x16::splat(0xF) + ); } #[simd_test = "avx2"] unsafe fn _mm256_srli_epi32() { assert_eq!( avx2::_mm256_srli_epi32(i32x8::splat(0xFFFF), 4), - i32x8::splat(0xFFF)); + i32x8::splat(0xFFF) + ); } #[simd_test = "avx2"] unsafe fn _mm256_srli_epi64() { assert_eq!( avx2::_mm256_srli_epi64(i64x4::splat(0xFFFFFFFF), 4), - i64x4::splat(0xFFFFFFF)); + i64x4::splat(0xFFFFFFF) + ); } #[simd_test = "avx2"] @@ -3274,41 +3429,51 @@ mod tests { #[simd_test = "avx2"] unsafe fn _mm256_alignr_epi8() { + #[cfg_attr(rustfmt, rustfmt_skip)] let a = i8x32::new( 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, - 25, 26, 27, 28, 29, 30, 31, 32); + 25, 26, 27, 28, 29, 30, 31, 32, + ); + #[cfg_attr(rustfmt, rustfmt_skip)] let b = i8x32::new( -1, -2, -3, -4, -5, -6, -7, -8, -9, -10, -11, -12, -13, -14, -15, -16, -17, -18, -19, -20, -21, -22, -23, -24, - -25, -26, -27, -28, -29, -30, -31, -32); + -25, -26, -27, -28, -29, -30, -31, -32, + ); let r = avx2::_mm256_alignr_epi8(a, b, 33); assert_eq!(r, i8x32::splat(0)); let r = avx2::_mm256_alignr_epi8(a, b, 17); + #[cfg_attr(rustfmt, rustfmt_skip)] let expected = i8x32::new( 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, - 26, 27, 28, 29, 30, 31, 32, 0); + 26, 27, 28, 29, 30, 31, 32, 0, + ); assert_eq!(r, expected); + #[cfg_attr(rustfmt, rustfmt_skip)] let expected = i8x32::new( -17, -18, -19, -20, -21, -22, -23, -24, -25, -26, -27, -28, -29, -30, -31, -32, 1, 2, 3, 4, 5, 6, 7, 8, - 9, 10, 11, 12, 13, 14, 15, 16); + 9, 10, 11, 12, 13, 14, 15, 16, + ); let r = avx2::_mm256_alignr_epi8(a, b, 16); assert_eq!(r, expected); let r = avx2::_mm256_alignr_epi8(a, b, 15); + #[cfg_attr(rustfmt, rustfmt_skip)] let expected = i8x32::new( -16, -17, -18, -19, -20, -21, -22, -23, -24, -25, -26, -27, -28, -29, -30, -31, -32, 1, 2, 3, 4, 5, 6, 7, - 8, 9, 10, 11, 12, 13, 14, 15); + 8, 9, 10, 11, 12, 13, 14, 15, + ); assert_eq!(r, expected); let r = avx2::_mm256_alignr_epi8(a, b, 0); @@ -3317,18 +3482,21 @@ mod tests { #[simd_test = "avx2"] unsafe fn _mm256_shuffle_epi8() { + #[cfg_attr(rustfmt, rustfmt_skip)] let a = u8x32::new( 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, - 25, 26, 27, 28, 29, 30, 31, 32 + 25, 26, 27, 28, 29, 30, 31, 32, ); + #[cfg_attr(rustfmt, rustfmt_skip)] let b = u8x32::new( 4, 128, 4, 3, 24, 12, 6, 19, 12, 5, 5, 10, 4, 1, 8, 0, 4, 128, 4, 3, 24, 12, 6, 19, 12, 5, 5, 10, 4, 1, 8, 0, ); + #[cfg_attr(rustfmt, rustfmt_skip)] let expected = u8x32::new( 5, 0, 5, 4, 9, 13, 7, 4, 13, 6, 6, 11, 5, 2, 9, 1, diff --git a/library/stdarch/src/x86/bmi.rs b/library/stdarch/src/x86/bmi.rs index 039318991551..2cc86d800923 100644 --- a/library/stdarch/src/x86/bmi.rs +++ b/library/stdarch/src/x86/bmi.rs @@ -1,11 +1,16 @@ //! Bit Manipulation Instruction (BMI) Set 1.0. //! //! The reference is [Intel 64 and IA-32 Architectures Software Developer's -//! Manual Volume 2: Instruction Set Reference, -//! A-Z](http://www.intel.de/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-instruction-set-reference-manual-325383.pdf). +//! Manual Volume 2: Instruction Set Reference, A-Z][intel64_ref]. //! -//! [Wikipedia](https://en.wikipedia.org/wiki/Bit_Manipulation_Instruction_Sets#BMI1_.28Bit_Manipulation_Instruction_Set_1.29) -//! provides a quick overview of the available instructions. +//! [Wikipedia][wikipedia_bmi] provides a quick overview of the instructions +//! available. +//! +//! [intel64_ref]: http://www.intel.de/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-instruction-set-reference-manual-325383.pdf +//! [wikipedia_bmi]: +//! https://en.wikipedia.org/wiki/Bit_Manipulation_Instruction_Sets#ABM_. +//! 28Advanced_Bit_Manipulation.29 + #[cfg(test)] use stdsimd_test::assert_instr; @@ -32,8 +37,8 @@ pub unsafe fn _bextr_u64(a: u64, start: u64, len: u64) -> u64 { /// Extracts bits of `a` specified by `control` into /// the least significant bits of the result. /// -/// Bits [7,0] of `control` specify the index to the first bit in the range to be -/// extracted, and bits [15,8] specify the length of the range. +/// Bits [7,0] of `control` specify the index to the first bit in the range to +/// be extracted, and bits [15,8] specify the length of the range. #[inline(always)] #[target_feature = "+bmi"] #[cfg_attr(test, assert_instr(bextr))] @@ -44,8 +49,8 @@ pub unsafe fn _bextr2_u32(a: u32, control: u32) -> u32 { /// Extracts bits of `a` specified by `control` into /// the least significant bits of the result. /// -/// Bits [7,0] of `control` specify the index to the first bit in the range to be -/// extracted, and bits [15,8] specify the length of the range. +/// Bits [7,0] of `control` specify the index to the first bit in the range to +/// be extracted, and bits [15,8] specify the length of the range. #[inline(always)] #[target_feature = "+bmi"] #[cfg_attr(test, assert_instr(bextr))] @@ -177,9 +182,9 @@ pub unsafe fn _mm_tzcnt_u64(x: u64) -> u64 { #[allow(dead_code)] extern "C" { - #[link_name="llvm.x86.bmi.bextr.32"] + #[link_name = "llvm.x86.bmi.bextr.32"] fn x86_bmi_bextr_32(x: u32, y: u32) -> u32; - #[link_name="llvm.x86.bmi.bextr.64"] + #[link_name = "llvm.x86.bmi.bextr.64"] fn x86_bmi_bextr_64(x: u64, y: u64) -> u64; } diff --git a/library/stdarch/src/x86/bmi2.rs b/library/stdarch/src/x86/bmi2.rs index 22cd3b4dc14e..700c6d0e436e 100644 --- a/library/stdarch/src/x86/bmi2.rs +++ b/library/stdarch/src/x86/bmi2.rs @@ -1,11 +1,15 @@ //! Bit Manipulation Instruction (BMI) Set 2.0. //! //! The reference is [Intel 64 and IA-32 Architectures Software Developer's -//! Manual Volume 2: Instruction Set Reference, -//! A-Z](http://www.intel.de/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectu res-software-developer-instruction-set-reference-manual-325383.pdf). +//! Manual Volume 2: Instruction Set Reference, A-Z][intel64_ref]. //! -//! [Wikipedia](https://en.wikipedia.org/wiki/Bit_Manipulation_Instruction_Sets#BMI2_.28Bit_Manipulation_Instruction_Set_2.29) -//! provides a quick overview of the available instructions. +//! [Wikipedia][wikipedia_bmi] provides a quick overview of the instructions +//! available. +//! +//! [intel64_ref]: http://www.intel.de/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-instruction-set-reference-manual-325383.pdf +//! [wikipedia_bmi]: +//! https://en.wikipedia.org/wiki/Bit_Manipulation_Instruction_Sets#ABM_. +//! 28Advanced_Bit_Manipulation.29 #[cfg(test)] use stdsimd_test::assert_instr; @@ -96,17 +100,17 @@ pub unsafe fn _pext_u64(a: u64, mask: u64) -> u64 { #[allow(dead_code)] extern "C" { - #[link_name="llvm.x86.bmi.bzhi.32"] + #[link_name = "llvm.x86.bmi.bzhi.32"] fn x86_bmi2_bzhi_32(x: u32, y: u32) -> u32; - #[link_name="llvm.x86.bmi.bzhi.64"] + #[link_name = "llvm.x86.bmi.bzhi.64"] fn x86_bmi2_bzhi_64(x: u64, y: u64) -> u64; - #[link_name="llvm.x86.bmi.pdep.32"] + #[link_name = "llvm.x86.bmi.pdep.32"] fn x86_bmi2_pdep_32(x: u32, y: u32) -> u32; - #[link_name="llvm.x86.bmi.pdep.64"] + #[link_name = "llvm.x86.bmi.pdep.64"] fn x86_bmi2_pdep_64(x: u64, y: u64) -> u64; - #[link_name="llvm.x86.bmi.pext.32"] + #[link_name = "llvm.x86.bmi.pext.32"] fn x86_bmi2_pext_32(x: u32, y: u32) -> u32; - #[link_name="llvm.x86.bmi.pext.64"] + #[link_name = "llvm.x86.bmi.pext.64"] fn x86_bmi2_pext_64(x: u64, y: u64) -> u64; } @@ -118,7 +122,7 @@ mod tests { #[simd_test = "bmi2"] unsafe fn _pext_u32() { - let n = 0b1011_1110_1001_0011u32; + let n = 0b1011_1110_1001_0011u32; let m0 = 0b0110_0011_1000_0101u32; let s0 = 0b0000_0000_0011_0101u32; @@ -133,7 +137,7 @@ mod tests { #[simd_test = "bmi2"] #[cfg(not(target_arch = "x86"))] unsafe fn _pext_u64() { - let n = 0b1011_1110_1001_0011u64; + let n = 0b1011_1110_1001_0011u64; let m0 = 0b0110_0011_1000_0101u64; let s0 = 0b0000_0000_0011_0101u64; @@ -147,7 +151,7 @@ mod tests { #[simd_test = "bmi2"] unsafe fn _pdep_u32() { - let n = 0b1011_1110_1001_0011u32; + let n = 0b1011_1110_1001_0011u32; let m0 = 0b0110_0011_1000_0101u32; let s0 = 0b0000_0010_0000_0101u32; @@ -162,7 +166,7 @@ mod tests { #[simd_test = "bmi2"] #[cfg(not(target_arch = "x86"))] unsafe fn _pdep_u64() { - let n = 0b1011_1110_1001_0011u64; + let n = 0b1011_1110_1001_0011u64; let m0 = 0b0110_0011_1000_0101u64; let s0 = 0b0000_0010_0000_0101u64; @@ -194,23 +198,31 @@ mod tests { let a: u32 = 4_294_967_200; let b: u32 = 2; let (lo, hi): (u32, u32) = bmi2::_mulx_u32(a, b); - // result = 8589934400 - // = 0b0001_1111_1111_1111_1111_1111_1111_0100_0000u64 - // ^~hi ^~lo~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + /* +result = 8589934400 + = 0b0001_1111_1111_1111_1111_1111_1111_0100_0000u64 + ^~hi ^~lo~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + */ assert_eq!(lo, 0b1111_1111_1111_1111_1111_1111_0100_0000u32); assert_eq!(hi, 0b0001u32); } #[simd_test = "bmi2"] #[cfg(not(target_arch = "x86"))] + #[cfg_attr(rustfmt, rustfmt_skip)] unsafe fn _mulx_u64() { let a: u64 = 9_223_372_036_854_775_800; let b: u64 = 100; let (lo, hi): (u64, u64) = bmi2::_mulx_u64(a, b); - // result = 922337203685477580000 - // = 0b00110001_11111111_11111111_11111111_11111111_11111111_11111111_11111100_11100000u128 - // ^~hi~~~~ ^~lo~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - assert_eq!(lo, 0b11111111_11111111_11111111_11111111_11111111_11111111_11111100_11100000u64); + /* +result = 922337203685477580000 = +0b00110001_1111111111111111_1111111111111111_1111111111111111_1111110011100000 + ^~hi~~~~ ^~lo~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + */ + assert_eq!( + lo, + 0b11111111_11111111_11111111_11111111_11111111_11111111_11111100_11100000u64 + ); assert_eq!(hi, 0b00110001u64); } } diff --git a/library/stdarch/src/x86/macros.rs b/library/stdarch/src/x86/macros.rs index 46b3f97d8a60..e835fe210709 100644 --- a/library/stdarch/src/x86/macros.rs +++ b/library/stdarch/src/x86/macros.rs @@ -348,5 +348,3 @@ macro_rules! assert_approx_eq { *a, *b, $eps, (*a - *b).abs()); }) } - - diff --git a/library/stdarch/src/x86/mod.rs b/library/stdarch/src/x86/mod.rs index 6dbdb3e247e4..a046c453c766 100644 --- a/library/stdarch/src/x86/mod.rs +++ b/library/stdarch/src/x86/mod.rs @@ -12,7 +12,7 @@ pub use self::bmi::*; pub use self::bmi2::*; pub use self::tbm::*; -pub use self::runtime::{__Feature, __unstable_detect_feature}; +pub use self::runtime::{__unstable_detect_feature, __Feature}; #[allow(non_camel_case_types)] pub type __m128i = ::v128::i8x16; diff --git a/library/stdarch/src/x86/runtime.rs b/library/stdarch/src/x86/runtime.rs index 4071a1ce587d..5d44c8793435 100644 --- a/library/stdarch/src/x86/runtime.rs +++ b/library/stdarch/src/x86/runtime.rs @@ -1,9 +1,9 @@ //! This module implements minimal run-time feature detection for x86. //! -//! The features are detected using the `detect_features` function below. This function -//! uses the CPUID instruction to read the feature flags from the CPU and encodes them in -//! an `usize` where each bit position represents whether a feature is available (bit is set) -//! or unavaiable (bit is cleared). +//! The features are detected using the `detect_features` function below. +//! This function uses the CPUID instruction to read the feature flags from the +//! CPU and encodes them in an `usize` where each bit position represents +//! whether a feature is available (bit is set) or unavaiable (bit is cleared). //! //! The enum `__Feature` is used to map bit positions to feature names, and the //! the `__unstable_detect_feature!` macro is used to map string literals (e.g. @@ -12,10 +12,10 @@ //! //! The run-time feature detection is performed by the //! `__unstable_detect_feature(__Feature) -> bool` function. On its first call, -//! this functions queries the CPU for the available features and stores them in -//! a global `AtomicUsize` variable. The query is performed by just checking whether the -//! feature bit in this global variable is set or cleared. -use ::std::sync::atomic::{AtomicUsize, Ordering}; +//! this functions queries the CPU for the available features and stores them +//! in a global `AtomicUsize` variable. The query is performed by just checking +//! whether the feature bit in this global variable is set or cleared. +use std::sync::atomic::{AtomicUsize, Ordering}; /// This macro maps the string-literal feature names to values of the /// `__Feature` enum at compile-time. The feature names used are the same as @@ -26,22 +26,68 @@ use ::std::sync::atomic::{AtomicUsize, Ordering}; #[macro_export] #[doc(hidden)] macro_rules! __unstable_detect_feature { - ("sse") => { $crate::vendor::__unstable_detect_feature($crate::vendor::__Feature::sse{}) }; - ("sse2") => { $crate::vendor::__unstable_detect_feature($crate::vendor::__Feature::sse2{}) }; - ("sse3") => { $crate::vendor::__unstable_detect_feature($crate::vendor::__Feature::sse3{}) }; - ("ssse3") => { $crate::vendor::__unstable_detect_feature($crate::vendor::__Feature::ssse3{}) }; - ("sse4.1") => { $crate::vendor::__unstable_detect_feature($crate::vendor::__Feature::sse4_1{}) }; - ("sse4.2") => { $crate::vendor::__unstable_detect_feature($crate::vendor::__Feature::sse4_2{}) }; - ("avx") => { $crate::vendor::__unstable_detect_feature($crate::vendor::__Feature::avx{}) }; - ("avx2") => { $crate::vendor::__unstable_detect_feature($crate::vendor::__Feature::avx2{}) }; - ("fma") => { $crate::vendor::__unstable_detect_feature($crate::vendor::__Feature::fma{}) }; - ("bmi") => { $crate::vendor::__unstable_detect_feature($crate::vendor::__Feature::bmi{}) }; - ("bmi2") => { $crate::vendor::__unstable_detect_feature($crate::vendor::__Feature::bmi2{}) }; - ("abm") => { $crate::vendor::__unstable_detect_feature($crate::vendor::__Feature::abm{}) }; - ("lzcnt") => { $crate::vendor::__unstable_detect_feature($crate::vendor::__Feature::abm{}) }; - ("tbm") => { $crate::vendor::__unstable_detect_feature($crate::vendor::__Feature::tbm{}) }; - ("popcnt") => { $crate::vendor::__unstable_detect_feature($crate::vendor::__Feature::popcnt{}) }; - ($t:tt) => { compile_error!(concat!("unknown target feature: ", $t)) }; + ("sse") => { + $crate::vendor::__unstable_detect_feature( + $crate::vendor::__Feature::sse{}) }; + ("sse2") => { + $crate::vendor::__unstable_detect_feature( + $crate::vendor::__Feature::sse2{}) + }; + ("sse3") => { + $crate::vendor::__unstable_detect_feature( + $crate::vendor::__Feature::sse3{}) + }; + ("ssse3") => { + $crate::vendor::__unstable_detect_feature( + $crate::vendor::__Feature::ssse3{}) + }; + ("sse4.1") => { + $crate::vendor::__unstable_detect_feature( + $crate::vendor::__Feature::sse4_1{}) + }; + ("sse4.2") => { + $crate::vendor::__unstable_detect_feature( + $crate::vendor::__Feature::sse4_2{}) + }; + ("avx") => { + $crate::vendor::__unstable_detect_feature( + $crate::vendor::__Feature::avx{}) + }; + ("avx2") => { + $crate::vendor::__unstable_detect_feature( + $crate::vendor::__Feature::avx2{}) + }; + ("fma") => { + $crate::vendor::__unstable_detect_feature( + $crate::vendor::__Feature::fma{}) + }; + ("bmi") => { + $crate::vendor::__unstable_detect_feature( + $crate::vendor::__Feature::bmi{}) + }; + ("bmi2") => { + $crate::vendor::__unstable_detect_feature( + $crate::vendor::__Feature::bmi2{}) + }; + ("abm") => { + $crate::vendor::__unstable_detect_feature( + $crate::vendor::__Feature::abm{}) + }; + ("lzcnt") => { + $crate::vendor::__unstable_detect_feature( + $crate::vendor::__Feature::abm{}) + }; + ("tbm") => { + $crate::vendor::__unstable_detect_feature( + $crate::vendor::__Feature::tbm{}) + }; + ("popcnt") => { + $crate::vendor::__unstable_detect_feature( + $crate::vendor::__Feature::popcnt{}) + }; + ($t:tt) => { + compile_error!(concat!("unknown target feature: ", $t)) + }; } /// X86 CPU Feature enum. Each variant denotes a position in a bitset for a @@ -74,15 +120,15 @@ pub enum __Feature { bmi, /// BMI1 (Bit Manipulation Instructions 2) bmi2, - /// ABM (Advanced Bit Manipulation) on AMD / LZCNT (Leading Zero Count) on Intel + /// ABM (Advanced Bit Manipulation) on AMD / LZCNT (Leading Zero + /// Count) on Intel abm, /// TBM (Trailing Bit Manipulation) tbm, /// POPCNT (Population Count) popcnt, - #[doc(hidden)] - __NonExhaustive + #[doc(hidden)] __NonExhaustive, } fn set_bit(x: usize, bit: u32) -> usize { @@ -102,14 +148,19 @@ fn inv_test_bit(v: usize, idx: u32) -> bool { /// Run-time feature detection on x86 works by using the CPUID instruction. /// -/// The [CPUID Wikipedia page](https://en.wikipedia.org/wiki/CPUID) contains all -/// the information about which flags to set to query which values, and in which -/// registers these are reported. +/// The [CPUID Wikipedia page][wiki_cpuid] contains +/// all the information about which flags to set to query which values, and in +/// which registers these are reported. /// /// The definitive references are: -/// - [Intel 64 and IA-32 Architectures Software Developer's Manual Volume 2: Instruction Set Reference, A-Z](http://www.intel.de/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-instruction-set-reference-manual-325383.pdf). -/// - [AMD64 Architecture Programmer's Manual, Volume 3: General-Purpose and System Instructions](http://support.amd.com/TechDocs/24594.pdf). +/// - [Intel 64 and IA-32 Architectures Software Developer's Manual Volume 2: +/// Instruction Set Reference, A-Z][intel64_ref]. +/// - [AMD64 Architecture Programmer's Manual, Volume 3: General-Purpose and +/// System Instructions][amd64_ref]. /// +/// [wiki_cpuid]: https://en.wikipedia.org/wiki/CPUID +/// [intel64_ref]: http://www.intel.de/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-instruction-set-reference-manual-325383.pdf +/// [amd64_ref]: http://support.amd.com/TechDocs/24594.pdf fn detect_features() -> usize { let ebx; let ecx; @@ -119,14 +170,16 @@ fn detect_features() -> usize { /// To obtain all feature flags we need two CPUID queries: /// 1. EAX=1, ECX=0: Queries "Processor Info and Feature Bits" - /// This gives us most of the CPU features in ECX and EDX (see below), + /// This gives us most of the CPU features in ECX and EDX (see + /// below), asm!("cpuid" : "={ecx}"(ecx), "={edx}"(edx) : "{eax}"(0x00000001u32), "{ecx}"(0 as u32) : :); /// 2. EAX=7, ECX=0: Queries "Extended Features" - /// This gives us information about bmi,bmi2, and avx2 support (see below). + /// This gives us information about bmi,bmi2, and avx2 support + /// (see below). asm!("cpuid" : "={ebx}"(ebx) : "{eax}"(0x00000007u32), "{ecx}"(0 as u32) @@ -135,36 +188,65 @@ fn detect_features() -> usize { let mut value: usize = 0; - // CPUID call with EAX=7, ECX=0 => Extended Features in EBX and ECX (unneeded): - if inv_test_bit(ebx, 3) { value = set_bit(value, __Feature::bmi as u32); } - if inv_test_bit(ebx, 5) { value = set_bit(value, __Feature::avx2 as u32); } - if inv_test_bit(ebx, 8) { value = set_bit(value, __Feature::bmi2 as u32); } + // CPUID call with EAX=7, ECX=0 => Extended Features in EBX and ECX + // (the result in ECX is not currently needed): + if inv_test_bit(ebx, 3) { + value = set_bit(value, __Feature::bmi as u32); + } + if inv_test_bit(ebx, 5) { + value = set_bit(value, __Feature::avx2 as u32); + } + if inv_test_bit(ebx, 8) { + value = set_bit(value, __Feature::bmi2 as u32); + } // CPUID call with EAX=1 => feature bits in ECX and EDX: - if inv_test_bit(ecx, 0) { value = set_bit(value, __Feature::sse3 as u32); } - if inv_test_bit(ecx, 5) { value = set_bit(value, __Feature::abm as u32); } - if inv_test_bit(ecx, 9) { value = set_bit(value, __Feature::ssse3 as u32); } - if inv_test_bit(ecx, 12) { value = set_bit(value, __Feature::fma as u32); } - if inv_test_bit(ecx, 19) { value = set_bit(value, __Feature::sse4_1 as u32); } - if inv_test_bit(ecx, 20) { value = set_bit(value, __Feature::sse4_2 as u32); } - if inv_test_bit(ecx, 21) { value = set_bit(value, __Feature::tbm as u32); } - if inv_test_bit(ecx, 23) { value = set_bit(value, __Feature::popcnt as u32); } - if inv_test_bit(ecx, 28) { value = set_bit(value, __Feature::avx as u32); } + if inv_test_bit(ecx, 0) { + value = set_bit(value, __Feature::sse3 as u32); + } + if inv_test_bit(ecx, 5) { + value = set_bit(value, __Feature::abm as u32); + } + if inv_test_bit(ecx, 9) { + value = set_bit(value, __Feature::ssse3 as u32); + } + if inv_test_bit(ecx, 12) { + value = set_bit(value, __Feature::fma as u32); + } + if inv_test_bit(ecx, 19) { + value = set_bit(value, __Feature::sse4_1 as u32); + } + if inv_test_bit(ecx, 20) { + value = set_bit(value, __Feature::sse4_2 as u32); + } + if inv_test_bit(ecx, 21) { + value = set_bit(value, __Feature::tbm as u32); + } + if inv_test_bit(ecx, 23) { + value = set_bit(value, __Feature::popcnt as u32); + } + if inv_test_bit(ecx, 28) { + value = set_bit(value, __Feature::avx as u32); + } - if inv_test_bit(edx, 25) { value = set_bit(value, __Feature::sse as u32); } - if inv_test_bit(edx, 26) { value = set_bit(value, __Feature::sse2 as u32); } + if inv_test_bit(edx, 25) { + value = set_bit(value, __Feature::sse as u32); + } + if inv_test_bit(edx, 26) { + value = set_bit(value, __Feature::sse2 as u32); + } value } -/// This global variable is a bitset used to cache the features supported by the -/// CPU. +/// This global variable is a bitset used to cache the features supported by +/// the CPU. static FEATURES: AtomicUsize = AtomicUsize::new(::std::usize::MAX); /// Performs run-time feature detection. /// -/// On its first invocation, it detects the CPU features and caches them in the -/// `FEATURES` global variable as an `AtomicUsize`. +/// On its first invocation, it detects the CPU features and caches them +/// in the `FEATURES` global variable as an `AtomicUsize`. /// /// It uses the `__Feature` variant to index into this variable as a bitset. If /// the bit is set, the feature is enabled, and otherwise it is disabled. @@ -172,7 +254,7 @@ static FEATURES: AtomicUsize = AtomicUsize::new(::std::usize::MAX); /// PLEASE: do not use this, it is an implementation detail subject to change. #[doc(hidden)] pub fn __unstable_detect_feature(x: __Feature) -> bool { - if FEATURES.load(Ordering::Relaxed) == ::std::usize::MAX { + if FEATURES.load(Ordering::Relaxed) == ::std::usize::MAX { FEATURES.store(detect_features(), Ordering::Relaxed); } test_bit(FEATURES.load(Ordering::Relaxed), x as u32) diff --git a/library/stdarch/src/x86/sse.rs b/library/stdarch/src/x86/sse.rs index f6f45a34de5f..5633d39c0e48 100644 --- a/library/stdarch/src/x86/sse.rs +++ b/library/stdarch/src/x86/sse.rs @@ -173,21 +173,23 @@ pub unsafe fn _mm_max_ps(a: f32x4, b: f32x4) -> f32x4 { #[target_feature = "+sse"] // i586 only seems to generate plain `and` instructions, so ignore it. #[cfg_attr(all(test, any(target_arch = "x86_64", target_feature = "sse2")), - assert_instr(andps))] + assert_instr(andps))] pub unsafe fn _mm_and_ps(a: f32x4, b: f32x4) -> f32x4 { let aa: i32x4 = mem::transmute(a); let bb: i32x4 = mem::transmute(b); mem::transmute(aa & bb) } -/// Bitwise AND-NOT of packed single-precision (32-bit) floating-point elements. +/// Bitwise AND-NOT of packed single-precision (32-bit) floating-point +/// elements. /// /// Computes `!a & b` for each bit in `a` and `b`. #[inline(always)] #[target_feature = "+sse"] -// i586 only seems to generate plain `not` and `and` instructions, so ignore it. +// i586 only seems to generate plain `not` and `and` instructions, so ignore +// it. #[cfg_attr(all(test, any(target_arch = "x86_64", target_feature = "sse2")), - assert_instr(andnps))] + assert_instr(andnps))] pub unsafe fn _mm_andnot_ps(a: f32x4, b: f32x4) -> f32x4 { let aa: i32x4 = mem::transmute(a); let bb: i32x4 = mem::transmute(b); @@ -199,7 +201,7 @@ pub unsafe fn _mm_andnot_ps(a: f32x4, b: f32x4) -> f32x4 { #[target_feature = "+sse"] // i586 only seems to generate plain `or` instructions, so we ignore it. #[cfg_attr(all(test, any(target_arch = "x86_64", target_feature = "sse2")), - assert_instr(orps))] + assert_instr(orps))] pub unsafe fn _mm_or_ps(a: f32x4, b: f32x4) -> f32x4 { let aa: i32x4 = mem::transmute(a); let bb: i32x4 = mem::transmute(b); @@ -212,7 +214,7 @@ pub unsafe fn _mm_or_ps(a: f32x4, b: f32x4) -> f32x4 { #[target_feature = "+sse"] // i586 only seems to generate plain `xor` instructions, so we ignore it. #[cfg_attr(all(test, any(target_arch = "x86_64", target_feature = "sse2")), - assert_instr(xorps))] + assert_instr(xorps))] pub unsafe fn _mm_xor_ps(a: f32x4, b: f32x4) -> f32x4 { let aa: i32x4 = mem::transmute(a); let bb: i32x4 = mem::transmute(b); @@ -229,8 +231,8 @@ pub unsafe fn _mm_cmpeq_ss(a: f32x4, b: f32x4) -> f32x4 { cmpss(a, b, 0) } -/// Compare the lowest `f32` of both inputs for less than. The lowest 32 bits of -/// the result will be `0xffffffff` if `a.extract(0)` is less than +/// Compare the lowest `f32` of both inputs for less than. The lowest 32 bits +/// of the result will be `0xffffffff` if `a.extract(0)` is less than /// `b.extract(0)`, or `0` otherwise. The upper 96 bits of the result are the /// upper 96 bits of `a`. #[inline(always)] @@ -241,9 +243,9 @@ pub unsafe fn _mm_cmplt_ss(a: f32x4, b: f32x4) -> f32x4 { } /// Compare the lowest `f32` of both inputs for less than or equal. The lowest -/// 32 bits of the result will be `0xffffffff` if `a.extract(0)` is less than or -/// equal `b.extract(0)`, or `0` otherwise. The upper 96 bits of the result are -/// the upper 96 bits of `a`. +/// 32 bits of the result will be `0xffffffff` if `a.extract(0)` is less than +/// or equal `b.extract(0)`, or `0` otherwise. The upper 96 bits of the result +/// are the upper 96 bits of `a`. #[inline(always)] #[target_feature = "+sse"] #[cfg_attr(test, assert_instr(cmpless))] @@ -251,10 +253,10 @@ pub unsafe fn _mm_cmple_ss(a: f32x4, b: f32x4) -> f32x4 { cmpss(a, b, 2) } -/// Compare the lowest `f32` of both inputs for greater than. The lowest 32 bits of -/// the result will be `0xffffffff` if `a.extract(0)` is greater than -/// `b.extract(0)`, or `0` otherwise. The upper 96 bits of the result are the -/// upper 96 bits of `a`. +/// Compare the lowest `f32` of both inputs for greater than. The lowest 32 +/// bits of the result will be `0xffffffff` if `a.extract(0)` is greater +/// than `b.extract(0)`, or `0` otherwise. The upper 96 bits of the result +/// are the upper 96 bits of `a`. #[inline(always)] #[target_feature = "+sse"] #[cfg_attr(test, assert_instr(cmpltss))] @@ -264,8 +266,8 @@ pub unsafe fn _mm_cmpgt_ss(a: f32x4, b: f32x4) -> f32x4 { /// Compare the lowest `f32` of both inputs for greater than or equal. The /// lowest 32 bits of the result will be `0xffffffff` if `a.extract(0)` is -/// greater than or equal `b.extract(0)`, or `0` otherwise. The upper 96 bits of -/// the result are the upper 96 bits of `a`. +/// greater than or equal `b.extract(0)`, or `0` otherwise. The upper 96 bits +/// of the result are the upper 96 bits of `a`. #[inline(always)] #[target_feature = "+sse"] #[cfg_attr(test, assert_instr(cmpless))] @@ -297,8 +299,8 @@ pub unsafe fn _mm_cmpnlt_ss(a: f32x4, b: f32x4) -> f32x4 { /// Compare the lowest `f32` of both inputs for not-less-than-or-equal. The /// lowest 32 bits of the result will be `0xffffffff` if `a.extract(0)` is not -/// less than or equal to `b.extract(0)`, or `0` otherwise. The upper 96 bits of -/// the result are the upper 96 bits of `a`. +/// less than or equal to `b.extract(0)`, or `0` otherwise. The upper 96 bits +/// of the result are the upper 96 bits of `a`. #[inline(always)] #[target_feature = "+sse"] #[cfg_attr(test, assert_instr(cmpnless))] @@ -319,8 +321,8 @@ pub unsafe fn _mm_cmpngt_ss(a: f32x4, b: f32x4) -> f32x4 { /// Compare the lowest `f32` of both inputs for not-greater-than-or-equal. The /// lowest 32 bits of the result will be `0xffffffff` if `a.extract(0)` is not -/// greater than or equal to `b.extract(0)`, or `0` otherwise. The upper 96 bits -/// of the result are the upper 96 bits of `a`. +/// greater than or equal to `b.extract(0)`, or `0` otherwise. The upper 96 +/// bits of the result are the upper 96 bits of `a`. #[inline(always)] #[target_feature = "+sse"] #[cfg_attr(test, assert_instr(cmpnless))] @@ -361,8 +363,8 @@ pub unsafe fn _mm_cmpeq_ps(a: f32x4, b: f32x4) -> f32x4 { } /// Compare each of the four floats in `a` to the corresponding element in `b`. -/// The result in the output vector will be `0xffffffff` if the input element in -/// `a` is less than the corresponding element in `b`, or `0` otherwise. +/// The result in the output vector will be `0xffffffff` if the input element +/// in `a` is less than the corresponding element in `b`, or `0` otherwise. #[inline(always)] #[target_feature = "+sse"] #[cfg_attr(test, assert_instr(cmpltps))] @@ -371,8 +373,8 @@ pub unsafe fn _mm_cmplt_ps(a: f32x4, b: f32x4) -> f32x4 { } /// Compare each of the four floats in `a` to the corresponding element in `b`. -/// The result in the output vector will be `0xffffffff` if the input element in -/// `a` is less than or equal to the corresponding element in `b`, or `0` +/// The result in the output vector will be `0xffffffff` if the input element +/// in `a` is less than or equal to the corresponding element in `b`, or `0` /// otherwise. #[inline(always)] #[target_feature = "+sse"] @@ -382,8 +384,8 @@ pub unsafe fn _mm_cmple_ps(a: f32x4, b: f32x4) -> f32x4 { } /// Compare each of the four floats in `a` to the corresponding element in `b`. -/// The result in the output vector will be `0xffffffff` if the input element in -/// `a` is greater than the corresponding element in `b`, or `0` otherwise. +/// The result in the output vector will be `0xffffffff` if the input element +/// in `a` is greater than the corresponding element in `b`, or `0` otherwise. #[inline(always)] #[target_feature = "+sse"] #[cfg_attr(test, assert_instr(cmpltps))] @@ -392,8 +394,8 @@ pub unsafe fn _mm_cmpgt_ps(a: f32x4, b: f32x4) -> f32x4 { } /// Compare each of the four floats in `a` to the corresponding element in `b`. -/// The result in the output vector will be `0xffffffff` if the input element in -/// `a` is greater than or equal to the corresponding element in `b`, or `0` +/// The result in the output vector will be `0xffffffff` if the input element +/// in `a` is greater than or equal to the corresponding element in `b`, or `0` /// otherwise. #[inline(always)] #[target_feature = "+sse"] @@ -413,8 +415,9 @@ pub unsafe fn _mm_cmpneq_ps(a: f32x4, b: f32x4) -> f32x4 { } /// Compare each of the four floats in `a` to the corresponding element in `b`. -/// The result in the output vector will be `0xffffffff` if the input element in -/// `a` is *not* less than the corresponding element in `b`, or `0` otherwise. +/// The result in the output vector will be `0xffffffff` if the input element +/// in `a` is *not* less than the corresponding element in `b`, or `0` +/// otherwise. #[inline(always)] #[target_feature = "+sse"] #[cfg_attr(test, assert_instr(cmpnltps))] @@ -423,9 +426,9 @@ pub unsafe fn _mm_cmpnlt_ps(a: f32x4, b: f32x4) -> f32x4 { } /// Compare each of the four floats in `a` to the corresponding element in `b`. -/// The result in the output vector will be `0xffffffff` if the input element in -/// `a` is *not* less than or equal to the corresponding element in `b`, or `0` -/// otherwise. +/// The result in the output vector will be `0xffffffff` if the input element +/// in `a` is *not* less than or equal to the corresponding element in `b`, or +/// `0` otherwise. #[inline(always)] #[target_feature = "+sse"] #[cfg_attr(test, assert_instr(cmpnleps))] @@ -434,8 +437,8 @@ pub unsafe fn _mm_cmpnle_ps(a: f32x4, b: f32x4) -> f32x4 { } /// Compare each of the four floats in `a` to the corresponding element in `b`. -/// The result in the output vector will be `0xffffffff` if the input element in -/// `a` is *not* greater than the corresponding element in `b`, or `0` +/// The result in the output vector will be `0xffffffff` if the input element +/// in `a` is *not* greater than the corresponding element in `b`, or `0` /// otherwise. #[inline(always)] #[target_feature = "+sse"] @@ -445,9 +448,9 @@ pub unsafe fn _mm_cmpngt_ps(a: f32x4, b: f32x4) -> f32x4 { } /// Compare each of the four floats in `a` to the corresponding element in `b`. -/// The result in the output vector will be `0xffffffff` if the input element in -/// `a` is *not* greater than or equal to the corresponding element in `b`, or -/// `0` otherwise. +/// The result in the output vector will be `0xffffffff` if the input element +/// in `a` is *not* greater than or equal to the corresponding element in `b`, +/// or `0` otherwise. #[inline(always)] #[target_feature = "+sse"] #[cfg_attr(test, assert_instr(cmpnleps))] @@ -642,7 +645,8 @@ pub unsafe fn _mm_cvtss_si64(a: f32x4) -> i64 { // pub unsafe fn _mm_cvtps_pi32(a: f32x4) -> i32x2 // pub unsafe fn _mm_cvt_ps2pi(a: f32x4) -> i32x2 { _mm_cvtps_pi32(a) } -/// Convert the lowest 32 bit float in the input vector to a 32 bit integer with +/// Convert the lowest 32 bit float in the input vector to a 32 bit integer +/// with /// truncation. /// /// The result is rounded always using truncation (round towards zero). If the @@ -666,8 +670,8 @@ pub unsafe fn _mm_cvtt_ss2si(a: f32x4) -> i32 { _mm_cvttss_si32(a) } -/// Convert the lowest 32 bit float in the input vector to a 64 bit integer with -/// truncation. +/// Convert the lowest 32 bit float in the input vector to a 64 bit integer +/// with truncation. /// /// The result is rounded always using truncation (round towards zero). If the /// result cannot be represented as a 64 bit integer the result will be @@ -765,8 +769,8 @@ pub unsafe fn _mm_set_ps1(a: f32) -> f32x4 { /// Construct a `f32x4` from four floating point values highest to lowest. /// -/// Note that `a` will be the highest 32 bits of the result, and `d` the lowest. -/// This matches the standard way of writing bit patterns on x86: +/// Note that `a` will be the highest 32 bits of the result, and `d` the +/// lowest. This matches the standard way of writing bit patterns on x86: /// /// ```text /// bit 127 .. 96 95 .. 64 63 .. 32 31 .. 0 @@ -884,8 +888,8 @@ pub unsafe fn _mm_unpacklo_ps(a: f32x4, b: f32x4) -> f32x4 { simd_shuffle4(a, b, [0, 4, 1, 5]) } -/// Combine higher half of `a` and `b`. The highwe half of `b` occupies the lower -/// half of result. +/// Combine higher half of `a` and `b`. The highwe half of `b` occupies the +/// lower half of result. #[inline(always)] #[target_feature = "+sse"] #[cfg_attr(all(test, not(windows)), assert_instr(movhlps))] @@ -895,8 +899,8 @@ pub unsafe fn _mm_movehl_ps(a: f32x4, b: f32x4) -> f32x4 { simd_shuffle4(a, b, [6, 7, 2, 3]) } -/// Combine lower half of `a` and `b`. The lower half of `b` occupies the higher -/// half of result. +/// Combine lower half of `a` and `b`. The lower half of `b` occupies the +/// higher half of result. #[inline(always)] #[target_feature = "+sse"] #[cfg_attr(all(test, target_feature = "sse2"), assert_instr(unpcklpd))] @@ -957,9 +961,9 @@ pub unsafe fn _mm_movemask_ps(a: f32x4) -> i32 { // 32-bit codegen does not generate `movhps` or `movhpd`, but instead // `movsd` followed by `unpcklpd` (or `movss'/`unpcklps` if there's no SSE2). #[cfg_attr(all(test, target_arch = "x86", target_feature = "sse2"), - assert_instr(unpcklpd))] + assert_instr(unpcklpd))] #[cfg_attr(all(test, target_arch = "x86", not(target_feature = "sse2")), - assert_instr(unpcklps))] + assert_instr(unpcklps))] // TODO: This function is actually not limited to floats, but that's what // what matches the C type most closely: (__m128, *const __m64) -> __m128 pub unsafe fn _mm_loadh_pi(a: f32x4, p: *const f32) -> f32x4 { @@ -1008,10 +1012,10 @@ pub unsafe fn _mm_loadh_pi(a: f32x4, p: *const f32) -> f32x4 { #[cfg_attr(all(test, target_arch = "x86_64"), assert_instr(movlpd))] // On 32-bit targets with SSE2, it just generates two `movsd`. #[cfg_attr(all(test, target_arch = "x86", target_feature = "sse2"), - assert_instr(movsd))] + assert_instr(movsd))] // It should really generate "movlps", but oh well... #[cfg_attr(all(test, target_arch = "x86", not(target_feature = "sse2")), - assert_instr(movss))] + assert_instr(movss))] // TODO: Like _mm_loadh_pi, this also isn't limited to floats. pub unsafe fn _mm_loadl_pi(a: f32x4, p: *const f32) -> f32x4 { let q = p as *const f32x2; @@ -1056,7 +1060,8 @@ pub unsafe fn _mm_load_ps1(p: *const f32) -> f32x4 { /// is not aligned to a 128-bit boundary (16 bytes) a general protection fault /// will be triggered (fatal program crash). /// -/// Use [`_mm_loadu_ps`](fn._mm_loadu_ps.html) for potentially unaligned memory. +/// Use [`_mm_loadu_ps`](fn._mm_loadu_ps.html) for potentially unaligned +/// memory. /// /// This corresponds to instructions `VMOVAPS` / `MOVAPS`. #[inline(always)] @@ -1066,8 +1071,10 @@ pub unsafe fn _mm_load_ps(p: *const f32) -> f32x4 { *(p as *const f32x4) } -/// Load four `f32` values from memory into a `f32x4`. There are no restrictions -/// on memory alignment. For aligned memory [`_mm_load_ps`](fn._mm_load_ps.html) +/// Load four `f32` values from memory into a `f32x4`. There are no +/// restrictions +/// on memory alignment. For aligned memory +/// [`_mm_load_ps`](fn._mm_load_ps.html) /// may be faster. /// /// This corresponds to instructions `VMOVUPS` / `MOVUPS`. @@ -1081,7 +1088,8 @@ pub unsafe fn _mm_loadu_ps(p: *const f32) -> f32x4 { ptr::copy_nonoverlapping( p as *const u8, &mut dst as *mut f32x4 as *mut u8, - mem::size_of::()); + mem::size_of::(), + ); dst } @@ -1117,10 +1125,11 @@ pub unsafe fn _mm_loadr_ps(p: *const f32) -> f32x4 { /// choose to generate an equivalent sequence of other instructions. #[inline(always)] #[target_feature = "+sse"] -// On i686 and up LLVM actually generates MOVHPD instead of MOVHPS, that's fine. +// On i686 and up LLVM actually generates MOVHPD instead of MOVHPS, that's +// fine. // On i586 (no SSE2) it just generates plain MOV instructions. #[cfg_attr(all(test, any(target_arch = "x86_64", target_feature = "sse2")), - assert_instr(movhpd))] + assert_instr(movhpd))] pub unsafe fn _mm_storeh_pi(p: *mut u64, a: f32x4) { if cfg!(target_arch = "x86") { // If this is a `f64x2` then on i586, LLVM generates fldl & fstpl which @@ -1128,7 +1137,8 @@ pub unsafe fn _mm_storeh_pi(p: *mut u64, a: f32x4) { let a64: u64x2 = mem::transmute(a); let a_hi = a64.extract(1); *p = mem::transmute(a_hi); - } else { // target_arch = "x86_64" + } else { + // target_arch = "x86_64" // If this is a `u64x2` LLVM generates a pshufd + movq, but we really // want a a MOVHPD or MOVHPS here. let a64: f64x2 = mem::transmute(a); @@ -1146,11 +1156,11 @@ pub unsafe fn _mm_storeh_pi(p: *mut u64, a: f32x4) { // On i586 the codegen just generates plane MOVs. No need to test for that. #[cfg_attr(all(test, any(target_arch = "x86_64", target_feature = "sse2"), not(target_family = "windows")), - assert_instr(movlps))] + assert_instr(movlps))] // Win64 passes `a` by reference, which causes it to generate two 64 bit moves. #[cfg_attr(all(test, any(target_arch = "x86_64", target_feature = "sse2"), target_family = "windows"), - assert_instr(movsd))] + assert_instr(movsd))] pub unsafe fn _mm_storel_pi(p: *mut u64, a: f32x4) { if cfg!(target_arch = "x86") { // Same as for _mm_storeh_pi: i586 code gen would use floating point @@ -1158,7 +1168,8 @@ pub unsafe fn _mm_storel_pi(p: *mut u64, a: f32x4) { let a64: u64x2 = mem::transmute(a); let a_hi = a64.extract(0); *p = mem::transmute(a_hi); - } else { // target_arch = "x86_64" + } else { + // target_arch = "x86_64" let a64: f64x2 = mem::transmute(a); let a_hi = a64.extract(0); *p = mem::transmute(a_hi); @@ -1235,7 +1246,8 @@ pub unsafe fn _mm_storeu_ps(p: *mut f32, a: f32x4) { ptr::copy_nonoverlapping( &a as *const f32x4 as *const u8, p as *mut u8, - mem::size_of::()); + mem::size_of::(), + ); } /// Store four 32-bit floats into *aligned* memory in reverse order. @@ -1309,7 +1321,8 @@ pub unsafe fn _mm_getcsr() -> u32 { /// * *Exception flags* report which exceptions occurred since last they were /// reset. /// -/// * *Masking flags* can be used to mask (ignore) certain exceptions. By default +/// * *Masking flags* can be used to mask (ignore) certain exceptions. By +/// default /// these flags are all set to 1, so all exceptions are masked. When an /// an exception is masked, the processor simply sets the exception flag and /// continues the operation. If the exception is unmasked, the flag is also set @@ -1332,11 +1345,13 @@ pub unsafe fn _mm_getcsr() -> u32 { /// * `_MM_EXCEPT_DIV_ZERO`: Division by zero occured. /// /// * `_MM_EXCEPT_OVERFLOW`: A numeric overflow exception occured, i.e., a -/// result was too large to be represented (e.g., an `f32` with absolute value +/// result was too large to be represented (e.g., an `f32` with absolute +/// value /// greater than `2^128`). /// /// * `_MM_EXCEPT_UNDERFLOW`: A numeric underflow exception occured, i.e., a -/// result was too small to be represented in a normalized way (e.g., an `f32` +/// result was too small to be represented in a normalized way (e.g., an +/// `f32` /// with absulte value smaller than `2^-126`.) /// /// * `_MM_EXCEPT_INEXACT`: An inexact-result exception occured (a.k.a. @@ -1374,10 +1389,12 @@ pub unsafe fn _mm_getcsr() -> u32 { /// exception, use: /// /// ```rust,ignore -/// _mm_setcsr(_mm_getcsr() & !_MM_MASK_UNDERFLOW); // unmask underflow exception +/// _mm_setcsr(_mm_getcsr() & !_MM_MASK_UNDERFLOW); // unmask underflow +/// exception /// ``` /// -/// Warning: an unmasked exception will cause an exception handler to be called. +/// Warning: an unmasked exception will cause an exception handler to be +/// called. /// The standard handler will simply terminate the process. So, in this case /// any underflow exception would terminate the current process with something /// like `signal: 8, SIGFPE: erroneous arithmetic operation`. @@ -1427,48 +1444,48 @@ pub unsafe fn _mm_setcsr(val: u32) { } /// See [`_mm_setcsr`](fn._mm_setcsr.html) -pub const _MM_EXCEPT_INVALID: u32 = 0x0001; +pub const _MM_EXCEPT_INVALID: u32 = 0x0001; /// See [`_mm_setcsr`](fn._mm_setcsr.html) -pub const _MM_EXCEPT_DENORM: u32 = 0x0002; +pub const _MM_EXCEPT_DENORM: u32 = 0x0002; /// See [`_mm_setcsr`](fn._mm_setcsr.html) -pub const _MM_EXCEPT_DIV_ZERO: u32 = 0x0004; +pub const _MM_EXCEPT_DIV_ZERO: u32 = 0x0004; /// See [`_mm_setcsr`](fn._mm_setcsr.html) -pub const _MM_EXCEPT_OVERFLOW: u32 = 0x0008; +pub const _MM_EXCEPT_OVERFLOW: u32 = 0x0008; /// See [`_mm_setcsr`](fn._mm_setcsr.html) -pub const _MM_EXCEPT_UNDERFLOW: u32 = 0x0010; +pub const _MM_EXCEPT_UNDERFLOW: u32 = 0x0010; /// See [`_mm_setcsr`](fn._mm_setcsr.html) -pub const _MM_EXCEPT_INEXACT: u32 = 0x0020; -pub const _MM_EXCEPT_MASK: u32 = 0x003f; +pub const _MM_EXCEPT_INEXACT: u32 = 0x0020; +pub const _MM_EXCEPT_MASK: u32 = 0x003f; /// See [`_mm_setcsr`](fn._mm_setcsr.html) -pub const _MM_MASK_INVALID: u32 = 0x0080; +pub const _MM_MASK_INVALID: u32 = 0x0080; /// See [`_mm_setcsr`](fn._mm_setcsr.html) -pub const _MM_MASK_DENORM: u32 = 0x0100; +pub const _MM_MASK_DENORM: u32 = 0x0100; /// See [`_mm_setcsr`](fn._mm_setcsr.html) -pub const _MM_MASK_DIV_ZERO: u32 = 0x0200; +pub const _MM_MASK_DIV_ZERO: u32 = 0x0200; /// See [`_mm_setcsr`](fn._mm_setcsr.html) -pub const _MM_MASK_OVERFLOW: u32 = 0x0400; +pub const _MM_MASK_OVERFLOW: u32 = 0x0400; /// See [`_mm_setcsr`](fn._mm_setcsr.html) -pub const _MM_MASK_UNDERFLOW: u32 = 0x0800; +pub const _MM_MASK_UNDERFLOW: u32 = 0x0800; /// See [`_mm_setcsr`](fn._mm_setcsr.html) -pub const _MM_MASK_INEXACT: u32 = 0x1000; -pub const _MM_MASK_MASK: u32 = 0x1f80; +pub const _MM_MASK_INEXACT: u32 = 0x1000; +pub const _MM_MASK_MASK: u32 = 0x1f80; /// See [`_mm_setcsr`](fn._mm_setcsr.html) -pub const _MM_ROUND_NEAREST: u32 = 0x0000; +pub const _MM_ROUND_NEAREST: u32 = 0x0000; /// See [`_mm_setcsr`](fn._mm_setcsr.html) -pub const _MM_ROUND_DOWN: u32 = 0x2000; +pub const _MM_ROUND_DOWN: u32 = 0x2000; /// See [`_mm_setcsr`](fn._mm_setcsr.html) -pub const _MM_ROUND_UP: u32 = 0x4000; +pub const _MM_ROUND_UP: u32 = 0x4000; /// See [`_mm_setcsr`](fn._mm_setcsr.html) pub const _MM_ROUND_TOWARD_ZERO: u32 = 0x6000; -pub const _MM_ROUND_MASK: u32 = 0x6000; +pub const _MM_ROUND_MASK: u32 = 0x6000; -pub const _MM_FLUSH_ZERO_MASK: u32 = 0x8000; +pub const _MM_FLUSH_ZERO_MASK: u32 = 0x8000; /// See [`_mm_setcsr`](fn._mm_setcsr.html) -pub const _MM_FLUSH_ZERO_ON: u32 = 0x8000; +pub const _MM_FLUSH_ZERO_ON: u32 = 0x8000; /// See [`_mm_setcsr`](fn._mm_setcsr.html) -pub const _MM_FLUSH_ZERO_OFF: u32 = 0x0000; +pub const _MM_FLUSH_ZERO_OFF: u32 = 0x0000; #[inline(always)] #[allow(non_snake_case)] @@ -1517,7 +1534,7 @@ pub unsafe fn _MM_SET_EXCEPTION_STATE(x: u32) { #[target_feature = "+sse"] pub unsafe fn _MM_SET_FLUSH_ZERO_MODE(x: u32) { let val = (_mm_getcsr() & !_MM_FLUSH_ZERO_MASK) | x; - //println!("setting csr={:x}", val); + // println!("setting csr={:x}", val); _mm_setcsr(val) } @@ -1584,8 +1601,8 @@ pub const _MM_HINT_NTA: i8 = 0; #[cfg_attr(test, assert_instr(prefetcht2, strategy = _MM_HINT_T2))] #[cfg_attr(test, assert_instr(prefetchnta, strategy = _MM_HINT_NTA))] pub unsafe fn _mm_prefetch(p: *const c_void, strategy: i8) { - // The `strategy` must be a compile-time constant, so we use a short form of - // `constify_imm8!` for now. + // The `strategy` must be a compile-time constant, so we use a short form + // of `constify_imm8!` for now. // We use the `llvm.prefetch` instrinsic with `rw` = 0 (read), and // `cache type` = 1 (data cache). `locality` is based on our `strategy`. macro_rules! pref { @@ -1612,7 +1629,9 @@ pub unsafe fn _mm_undefined_ps() -> f32x4 { #[inline(always)] #[allow(non_snake_case)] #[target_feature = "+sse"] -pub unsafe fn _MM_TRANSPOSE4_PS(row0: &mut f32x4, row1: &mut f32x4, row2: &mut f32x4, row3: &mut f32x4) { +pub unsafe fn _MM_TRANSPOSE4_PS( + row0: &mut f32x4, row1: &mut f32x4, row2: &mut f32x4, row3: &mut f32x4 +) { let tmp0 = _mm_unpacklo_ps(*row0, *row1); let tmp2 = _mm_unpacklo_ps(*row2, *row3); let tmp1 = _mm_unpackhi_ps(*row0, *row1); @@ -1625,7 +1644,7 @@ pub unsafe fn _MM_TRANSPOSE4_PS(row0: &mut f32x4, row1: &mut f32x4, row2: &mut f } #[allow(improper_ctypes)] -extern { +extern "C" { #[link_name = "llvm.x86.sse.add.ss"] fn addss(a: f32x4, b: f32x4) -> f32x4; #[link_name = "llvm.x86.sse.sub.ss"] @@ -1709,7 +1728,7 @@ mod tests { use v128::*; use x86::sse; use stdsimd_test::simd_test; - use test::black_box; // Used to inhibit constant-folding. + use test::black_box; // Used to inhibit constant-folding. #[simd_test = "sse"] unsafe fn _mm_add_ps() { @@ -1934,9 +1953,9 @@ mod tests { let c = f32x4::new(1.0, 5.0, 6.0, 7.0); let d = f32x4::new(2.0, 5.0, 6.0, 7.0); - let b1 = 0u32; // a.extract(0) < b.extract(0) - let c1 = 0u32; // a.extract(0) < c.extract(0) - let d1 = !0u32; // a.extract(0) < d.extract(0) + let b1 = 0u32; // a.extract(0) < b.extract(0) + let c1 = 0u32; // a.extract(0) < c.extract(0) + let d1 = !0u32; // a.extract(0) < d.extract(0) let rb: u32x4 = transmute(sse::_mm_cmplt_ss(a, b)); let eb: u32x4 = transmute(f32x4::new(transmute(b1), 2.0, 3.0, 4.0)); @@ -1960,9 +1979,9 @@ mod tests { let c = f32x4::new(1.0, 5.0, 6.0, 7.0); let d = f32x4::new(2.0, 5.0, 6.0, 7.0); - let b1 = 0u32; // a.extract(0) <= b.extract(0) - let c1 = !0u32; // a.extract(0) <= c.extract(0) - let d1 = !0u32; // a.extract(0) <= d.extract(0) + let b1 = 0u32; // a.extract(0) <= b.extract(0) + let c1 = !0u32; // a.extract(0) <= c.extract(0) + let d1 = !0u32; // a.extract(0) <= d.extract(0) let rb: u32x4 = transmute(sse::_mm_cmple_ss(a, b)); let eb: u32x4 = transmute(f32x4::new(transmute(b1), 2.0, 3.0, 4.0)); @@ -1986,9 +2005,9 @@ mod tests { let c = f32x4::new(1.0, 5.0, 6.0, 7.0); let d = f32x4::new(2.0, 5.0, 6.0, 7.0); - let b1 = !0u32; // a.extract(0) > b.extract(0) - let c1 = 0u32; // a.extract(0) > c.extract(0) - let d1 = 0u32; // a.extract(0) > d.extract(0) + let b1 = !0u32; // a.extract(0) > b.extract(0) + let c1 = 0u32; // a.extract(0) > c.extract(0) + let d1 = 0u32; // a.extract(0) > d.extract(0) let rb: u32x4 = transmute(sse::_mm_cmpgt_ss(a, b)); let eb: u32x4 = transmute(f32x4::new(transmute(b1), 2.0, 3.0, 4.0)); @@ -2012,9 +2031,9 @@ mod tests { let c = f32x4::new(1.0, 5.0, 6.0, 7.0); let d = f32x4::new(2.0, 5.0, 6.0, 7.0); - let b1 = !0u32; // a.extract(0) >= b.extract(0) - let c1 = !0u32; // a.extract(0) >= c.extract(0) - let d1 = 0u32; // a.extract(0) >= d.extract(0) + let b1 = !0u32; // a.extract(0) >= b.extract(0) + let c1 = !0u32; // a.extract(0) >= c.extract(0) + let d1 = 0u32; // a.extract(0) >= d.extract(0) let rb: u32x4 = transmute(sse::_mm_cmpge_ss(a, b)); let eb: u32x4 = transmute(f32x4::new(transmute(b1), 2.0, 3.0, 4.0)); @@ -2038,9 +2057,9 @@ mod tests { let c = f32x4::new(1.0, 5.0, 6.0, 7.0); let d = f32x4::new(2.0, 5.0, 6.0, 7.0); - let b1 = !0u32; // a.extract(0) != b.extract(0) - let c1 = 0u32; // a.extract(0) != c.extract(0) - let d1 = !0u32; // a.extract(0) != d.extract(0) + let b1 = !0u32; // a.extract(0) != b.extract(0) + let c1 = 0u32; // a.extract(0) != c.extract(0) + let d1 = !0u32; // a.extract(0) != d.extract(0) let rb: u32x4 = transmute(sse::_mm_cmpneq_ss(a, b)); let eb: u32x4 = transmute(f32x4::new(transmute(b1), 2.0, 3.0, 4.0)); @@ -2058,8 +2077,9 @@ mod tests { #[simd_test = "sse"] unsafe fn _mm_cmpnlt_ss() { // TODO: This test is exactly the same as for _mm_cmpge_ss, but there - // must be a difference. It may have to do with behavior in the presence - // of NaNs (signaling or quiet). If so, we should add tests for those. + // must be a difference. It may have to do with behavior in the + // presence of NaNs (signaling or quiet). If so, we should add tests + // for those. use std::mem::transmute; let a = f32x4::new(1.0, 2.0, 3.0, 4.0); @@ -2067,9 +2087,9 @@ mod tests { let c = f32x4::new(1.0, 5.0, 6.0, 7.0); let d = f32x4::new(2.0, 5.0, 6.0, 7.0); - let b1 = !0u32; // a.extract(0) >= b.extract(0) - let c1 = !0u32; // a.extract(0) >= c.extract(0) - let d1 = 0u32; // a.extract(0) >= d.extract(0) + let b1 = !0u32; // a.extract(0) >= b.extract(0) + let c1 = !0u32; // a.extract(0) >= c.extract(0) + let d1 = 0u32; // a.extract(0) >= d.extract(0) let rb: u32x4 = transmute(sse::_mm_cmpnlt_ss(a, b)); let eb: u32x4 = transmute(f32x4::new(transmute(b1), 2.0, 3.0, 4.0)); @@ -2087,7 +2107,8 @@ mod tests { #[simd_test = "sse"] unsafe fn _mm_cmpnle_ss() { // TODO: This test is exactly the same as for _mm_cmpgt_ss, but there - // must be a difference. It may have to do with behavior in the presence + // must be a difference. It may have to do with behavior in the + // presence // of NaNs (signaling or quiet). If so, we should add tests for those. use std::mem::transmute; @@ -2096,9 +2117,9 @@ mod tests { let c = f32x4::new(1.0, 5.0, 6.0, 7.0); let d = f32x4::new(2.0, 5.0, 6.0, 7.0); - let b1 = !0u32; // a.extract(0) > b.extract(0) - let c1 = 0u32; // a.extract(0) > c.extract(0) - let d1 = 0u32; // a.extract(0) > d.extract(0) + let b1 = !0u32; // a.extract(0) > b.extract(0) + let c1 = 0u32; // a.extract(0) > c.extract(0) + let d1 = 0u32; // a.extract(0) > d.extract(0) let rb: u32x4 = transmute(sse::_mm_cmpnle_ss(a, b)); let eb: u32x4 = transmute(f32x4::new(transmute(b1), 2.0, 3.0, 4.0)); @@ -2116,8 +2137,9 @@ mod tests { #[simd_test = "sse"] unsafe fn _mm_cmpngt_ss() { // TODO: This test is exactly the same as for _mm_cmple_ss, but there - // must be a difference. It may have to do with behavior in the presence - // of NaNs (signaling or quiet). If so, we should add tests for those. + // must be a difference. It may have to do with behavior in the + // presence of NaNs (signaling or quiet). If so, we should add tests + // for those. use std::mem::transmute; let a = f32x4::new(1.0, 2.0, 3.0, 4.0); @@ -2125,9 +2147,9 @@ mod tests { let c = f32x4::new(1.0, 5.0, 6.0, 7.0); let d = f32x4::new(2.0, 5.0, 6.0, 7.0); - let b1 = 0u32; // a.extract(0) <= b.extract(0) - let c1 = !0u32; // a.extract(0) <= c.extract(0) - let d1 = !0u32; // a.extract(0) <= d.extract(0) + let b1 = 0u32; // a.extract(0) <= b.extract(0) + let c1 = !0u32; // a.extract(0) <= c.extract(0) + let d1 = !0u32; // a.extract(0) <= d.extract(0) let rb: u32x4 = transmute(sse::_mm_cmpngt_ss(a, b)); let eb: u32x4 = transmute(f32x4::new(transmute(b1), 2.0, 3.0, 4.0)); @@ -2145,8 +2167,9 @@ mod tests { #[simd_test = "sse"] unsafe fn _mm_cmpnge_ss() { // TODO: This test is exactly the same as for _mm_cmplt_ss, but there - // must be a difference. It may have to do with behavior in the presence - // of NaNs (signaling or quiet). If so, we should add tests for those. + // must be a difference. It may have to do with behavior in the + // presence of NaNs (signaling or quiet). If so, we should add tests + // for those. use std::mem::transmute; let a = f32x4::new(1.0, 2.0, 3.0, 4.0); @@ -2154,9 +2177,9 @@ mod tests { let c = f32x4::new(1.0, 5.0, 6.0, 7.0); let d = f32x4::new(2.0, 5.0, 6.0, 7.0); - let b1 = 0u32; // a.extract(0) < b.extract(0) - let c1 = 0u32; // a.extract(0) < c.extract(0) - let d1 = !0u32; // a.extract(0) < d.extract(0) + let b1 = 0u32; // a.extract(0) < b.extract(0) + let c1 = 0u32; // a.extract(0) < c.extract(0) + let d1 = !0u32; // a.extract(0) < d.extract(0) let rb: u32x4 = transmute(sse::_mm_cmpnge_ss(a, b)); let eb: u32x4 = transmute(f32x4::new(transmute(b1), 2.0, 3.0, 4.0)); @@ -2181,9 +2204,9 @@ mod tests { let c = f32x4::new(NAN, 5.0, 6.0, 7.0); let d = f32x4::new(2.0, 5.0, 6.0, 7.0); - let b1 = !0u32; // a.extract(0) ord b.extract(0) - let c1 = 0u32; // a.extract(0) ord c.extract(0) - let d1 = !0u32; // a.extract(0) ord d.extract(0) + let b1 = !0u32; // a.extract(0) ord b.extract(0) + let c1 = 0u32; // a.extract(0) ord c.extract(0) + let d1 = !0u32; // a.extract(0) ord d.extract(0) let rb: u32x4 = transmute(sse::_mm_cmpord_ss(a, b)); let eb: u32x4 = transmute(f32x4::new(transmute(b1), 2.0, 3.0, 4.0)); @@ -2208,9 +2231,9 @@ mod tests { let c = f32x4::new(NAN, 5.0, 6.0, 7.0); let d = f32x4::new(2.0, 5.0, 6.0, 7.0); - let b1 = 0u32; // a.extract(0) unord b.extract(0) - let c1 = !0u32; // a.extract(0) unord c.extract(0) - let d1 = 0u32; // a.extract(0) unord d.extract(0) + let b1 = 0u32; // a.extract(0) unord b.extract(0) + let c1 = !0u32; // a.extract(0) unord c.extract(0) + let d1 = 0u32; // a.extract(0) unord d.extract(0) let rb: u32x4 = transmute(sse::_mm_cmpunord_ss(a, b)); let eb: u32x4 = transmute(f32x4::new(transmute(b1), 2.0, 3.0, 4.0)); @@ -2418,9 +2441,16 @@ mod tests { let r = sse::_mm_comieq_ss(a, b); - assert_eq!(ee[i], r, + assert_eq!( + ee[i], + r, "_mm_comieq_ss({:?}, {:?}) = {}, expected: {} (i={})", - a, b, r, ee[i], i); + a, + b, + r, + ee[i], + i + ); } } @@ -2439,9 +2469,16 @@ mod tests { let r = sse::_mm_comilt_ss(a, b); - assert_eq!(ee[i], r, + assert_eq!( + ee[i], + r, "_mm_comilt_ss({:?}, {:?}) = {}, expected: {} (i={})", - a, b, r, ee[i], i); + a, + b, + r, + ee[i], + i + ); } } @@ -2460,9 +2497,16 @@ mod tests { let r = sse::_mm_comile_ss(a, b); - assert_eq!(ee[i], r, + assert_eq!( + ee[i], + r, "_mm_comile_ss({:?}, {:?}) = {}, expected: {} (i={})", - a, b, r, ee[i], i); + a, + b, + r, + ee[i], + i + ); } } @@ -2481,9 +2525,16 @@ mod tests { let r = sse::_mm_comigt_ss(a, b); - assert_eq!(ee[i], r, + assert_eq!( + ee[i], + r, "_mm_comigt_ss({:?}, {:?}) = {}, expected: {} (i={})", - a, b, r, ee[i], i); + a, + b, + r, + ee[i], + i + ); } } @@ -2502,9 +2553,16 @@ mod tests { let r = sse::_mm_comige_ss(a, b); - assert_eq!(ee[i], r, + assert_eq!( + ee[i], + r, "_mm_comige_ss({:?}, {:?}) = {}, expected: {} (i={})", - a, b, r, ee[i], i); + a, + b, + r, + ee[i], + i + ); } } @@ -2523,9 +2581,16 @@ mod tests { let r = sse::_mm_comineq_ss(a, b); - assert_eq!(ee[i], r, + assert_eq!( + ee[i], + r, "_mm_comineq_ss({:?}, {:?}) = {}, expected: {} (i={})", - a, b, r, ee[i], i); + a, + b, + r, + ee[i], + i + ); } } @@ -2544,9 +2609,16 @@ mod tests { let r = sse::_mm_ucomieq_ss(a, b); - assert_eq!(ee[i], r, + assert_eq!( + ee[i], + r, "_mm_ucomieq_ss({:?}, {:?}) = {}, expected: {} (i={})", - a, b, r, ee[i], i); + a, + b, + r, + ee[i], + i + ); } } @@ -2565,9 +2637,16 @@ mod tests { let r = sse::_mm_ucomilt_ss(a, b); - assert_eq!(ee[i], r, + assert_eq!( + ee[i], + r, "_mm_ucomilt_ss({:?}, {:?}) = {}, expected: {} (i={})", - a, b, r, ee[i], i); + a, + b, + r, + ee[i], + i + ); } } @@ -2586,9 +2665,16 @@ mod tests { let r = sse::_mm_ucomile_ss(a, b); - assert_eq!(ee[i], r, + assert_eq!( + ee[i], + r, "_mm_ucomile_ss({:?}, {:?}) = {}, expected: {} (i={})", - a, b, r, ee[i], i); + a, + b, + r, + ee[i], + i + ); } } @@ -2607,9 +2693,16 @@ mod tests { let r = sse::_mm_ucomigt_ss(a, b); - assert_eq!(ee[i], r, + assert_eq!( + ee[i], + r, "_mm_ucomigt_ss({:?}, {:?}) = {}, expected: {} (i={})", - a, b, r, ee[i], i); + a, + b, + r, + ee[i], + i + ); } } @@ -2628,9 +2721,16 @@ mod tests { let r = sse::_mm_ucomige_ss(a, b); - assert_eq!(ee[i], r, + assert_eq!( + ee[i], + r, "_mm_ucomige_ss({:?}, {:?}) = {}, expected: {} (i={})", - a, b, r, ee[i], i); + a, + b, + r, + ee[i], + i + ); } } @@ -2649,9 +2749,16 @@ mod tests { let r = sse::_mm_ucomineq_ss(a, b); - assert_eq!(ee[i], r, + assert_eq!( + ee[i], + r, "_mm_ucomineq_ss({:?}, {:?}) = {}, expected: {} (i={})", - a, b, r, ee[i], i); + a, + b, + r, + ee[i], + i + ); } } @@ -2659,12 +2766,12 @@ mod tests { unsafe fn _mm_comieq_ss_vs_ucomieq_ss() { // If one of the arguments is a quiet NaN `comieq_ss` should signal an // Invalid Operation Exception while `ucomieq_ss` should not. - use std::f32::NAN; // This is a quiet NaN. + use std::f32::NAN; // This is a quiet NaN. let aa = &[3.0f32, NAN, 23.0, NAN]; let bb = &[3.0f32, 47.5, NAN, NAN]; let ee = &[1i32, 0, 0, 0]; - let exc = &[0u32, 1, 1, 1]; // Should comieq_ss signal an exception? + let exc = &[0u32, 1, 1, 1]; // Should comieq_ss signal an exception? for i in 0..4 { let a = f32x4::new(aa[i], 1.0, 2.0, 3.0); @@ -2678,16 +2785,40 @@ mod tests { let r2 = sse::_mm_ucomieq_ss(*black_box(&a), b); let s2 = sse::_MM_GET_EXCEPTION_STATE(); - assert_eq!(ee[i], r1, + assert_eq!( + ee[i], + r1, "_mm_comeq_ss({:?}, {:?}) = {}, expected: {} (i={})", - a, b, r1, ee[i], i); - assert_eq!(ee[i], r2, + a, + b, + r1, + ee[i], + i + ); + assert_eq!( + ee[i], + r2, "_mm_ucomeq_ss({:?}, {:?}) = {}, expected: {} (i={})", - a, b, r2, ee[i], i); - assert_eq!(s1, exc[i] * sse::_MM_EXCEPT_INVALID, - "_mm_comieq_ss() set exception flags: {} (i={})", s1, i); - assert_eq!(s2, 0, // ucomieq_ss should not signal an exception - "_mm_ucomieq_ss() set exception flags: {} (i={})", s2, i); + a, + b, + r2, + ee[i], + i + ); + assert_eq!( + s1, + exc[i] * sse::_MM_EXCEPT_INVALID, + "_mm_comieq_ss() set exception flags: {} (i={})", + s1, + i + ); + assert_eq!( + s2, + 0, // ucomieq_ss should not signal an exception + "_mm_ucomieq_ss() set exception flags: {} (i={})", + s2, + i + ); } } @@ -2696,14 +2827,20 @@ mod tests { use std::f32::NAN; use std::i32::MIN; let inputs = &[42.0f32, -3.1, 4.0e10, 4.0e-20, NAN, 2147483500.1]; - let result = &[42i32, -3, MIN, 0, MIN, 2147483520]; + let result = &[42i32, -3, MIN, 0, MIN, 2147483520]; for i in 0..inputs.len() { let x = f32x4::new(inputs[i], 1.0, 3.0, 4.0); let e = result[i]; let r = sse::_mm_cvtss_si32(x); - assert_eq!(e, r, + assert_eq!( + e, + r, "TestCase #{} _mm_cvtss_si32({:?}) = {}, expected: {}", - i, x, r, e); + i, + x, + r, + e + ); } } @@ -2713,23 +2850,29 @@ mod tests { use std::f32::NAN; use std::i64::MIN; let inputs = &[ - (42.0f32, 42i64), - (-31.4, -31), - (-33.5, -34), - (-34.5, -34), - (4.0e10, 40_000_000_000), + (42.0f32, 42i64), + (-31.4, -31), + (-33.5, -34), + (-34.5, -34), + (4.0e10, 40_000_000_000), (4.0e-10, 0), (NAN, MIN), (2147483500.1, 2147483520), - (9.223371e18, 9223370937343148032) + (9.223371e18, 9223370937343148032), ]; for i in 0..inputs.len() { let (xi, e) = inputs[i]; let x = f32x4::new(xi, 1.0, 3.0, 4.0); let r = sse::_mm_cvtss_si64(x); - assert_eq!(e, r, + assert_eq!( + e, + r, "TestCase #{} _mm_cvtss_si64({:?}) = {}, expected: {}", - i, x, r, e); + i, + x, + r, + e + ); } } @@ -2738,13 +2881,13 @@ mod tests { use std::f32::NAN; use std::i32::MIN; let inputs = &[ - (42.0f32, 42i32), - (-31.4, -31), - (-33.5, -33), - (-34.5, -34), - (10.999, 10), - (-5.99, -5), - (4.0e10, MIN), + (42.0f32, 42i32), + (-31.4, -31), + (-33.5, -33), + (-34.5, -34), + (10.999, 10), + (-5.99, -5), + (4.0e10, MIN), (4.0e-10, 0), (NAN, MIN), (2147483500.1, 2147483520), @@ -2753,9 +2896,15 @@ mod tests { let (xi, e) = inputs[i]; let x = f32x4::new(xi, 1.0, 3.0, 4.0); let r = sse::_mm_cvttss_si32(x); - assert_eq!(e, r, + assert_eq!( + e, + r, "TestCase #{} _mm_cvttss_si32({:?}) = {}, expected: {}", - i, x, r, e); + i, + x, + r, + e + ); } } @@ -2765,13 +2914,13 @@ mod tests { use std::f32::NAN; use std::i64::MIN; let inputs = &[ - (42.0f32, 42i64), - (-31.4, -31), - (-33.5, -33), - (-34.5, -34), - (10.999, 10), - (-5.99, -5), - (4.0e10, 40_000_000_000), + (42.0f32, 42i64), + (-31.4, -31), + (-33.5, -33), + (-34.5, -34), + (10.999, 10), + (-5.99, -5), + (4.0e10, 40_000_000_000), (4.0e-10, 0), (NAN, MIN), (2147483500.1, 2147483520), @@ -2782,19 +2931,25 @@ mod tests { let (xi, e) = inputs[i]; let x = f32x4::new(xi, 1.0, 3.0, 4.0); let r = sse::_mm_cvttss_si64(x); - assert_eq!(e, r, + assert_eq!( + e, + r, "TestCase #{} _mm_cvttss_si64({:?}) = {}, expected: {}", - i, x, r, e); + i, + x, + r, + e + ); } } #[simd_test = "sse"] pub unsafe fn _mm_cvtsi32_ss() { let inputs = &[ - (4555i32, 4555.0f32), + (4555i32, 4555.0f32), (322223333, 322223330.0), - (-432, -432.0), - (-322223333, -322223330.0) + (-432, -432.0), + (-322223333, -322223330.0), ]; for i in 0..inputs.len() { @@ -2802,9 +2957,16 @@ mod tests { let a = f32x4::new(5.0, 6.0, 7.0, 8.0); let r = sse::_mm_cvtsi32_ss(a, x); let e = a.replace(0, f); - assert_eq!(e, r, + assert_eq!( + e, + r, "TestCase #{} _mm_cvtsi32_ss({:?}, {}) = {:?}, expected: {:?}", - i, a, x, r, e); + i, + a, + x, + r, + e + ); } } @@ -2812,12 +2974,12 @@ mod tests { #[cfg(target_arch = "x86_64")] pub unsafe fn _mm_cvtsi64_ss() { let inputs = &[ - (4555i64, 4555.0f32), + (4555i64, 4555.0f32), (322223333, 322223330.0), - (-432, -432.0), + (-432, -432.0), (-322223333, -322223330.0), (9223372036854775807, 9.223372e18), - (-9223372036854775808, -9.223372e18) + (-9223372036854775808, -9.223372e18), ]; for i in 0..inputs.len() { @@ -2825,9 +2987,16 @@ mod tests { let a = f32x4::new(5.0, 6.0, 7.0, 8.0); let r = sse::_mm_cvtsi64_ss(a, x); let e = a.replace(0, f); - assert_eq!(e, r, + assert_eq!( + e, + r, "TestCase #{} _mm_cvtsi64_ss({:?}, {}) = {:?}, expected: {:?}", - i, a, x, r, e); + i, + a, + x, + r, + e + ); } } @@ -2854,14 +3023,22 @@ mod tests { #[simd_test = "sse"] unsafe fn _mm_set_ps() { let r = sse::_mm_set_ps( - black_box(1.0), black_box(2.0), black_box(3.0), black_box(4.0)); + black_box(1.0), + black_box(2.0), + black_box(3.0), + black_box(4.0), + ); assert_eq!(r, f32x4::new(4.0, 3.0, 2.0, 1.0)); } #[simd_test = "sse"] unsafe fn _mm_setr_ps() { let r = sse::_mm_setr_ps( - black_box(1.0), black_box(2.0), black_box(3.0), black_box(4.0)); + black_box(1.0), + black_box(2.0), + black_box(3.0), + black_box(4.0), + ); assert_eq!(r, f32x4::new(1.0, 2.0, 3.0, 4.0)); } @@ -3170,7 +3347,7 @@ mod tests { sse::_mm_setcsr(saved_csr); let exp = f32x4::new(0.0, 0.0, 0.0, 1.0); - assert_eq!(r, exp); // first component is a denormalized f32 + assert_eq!(r, exp); // first component is a denormalized f32 } #[simd_test = "sse"] @@ -3188,7 +3365,7 @@ mod tests { sse::_mm_setcsr(saved_csr); let exp = f32x4::new(1.1e-39, 0.0, 0.0, 1.0); - assert_eq!(r, exp); // first component is a denormalized f32 + assert_eq!(r, exp); // first component is a denormalized f32 } #[simd_test = "sse"] @@ -3198,7 +3375,7 @@ mod tests { let a = f32x4::new(1.1e-36, 0.0, 0.0, 1.0); let b = f32x4::new(1e-5, 0.0, 0.0, 1.0); - assert_eq!(sse::_MM_GET_EXCEPTION_STATE(), 0); // just to be sure + assert_eq!(sse::_MM_GET_EXCEPTION_STATE(), 0); // just to be sure let r = sse::_mm_mul_ps(*black_box(&a), *black_box(&b)); diff --git a/library/stdarch/src/x86/sse2.rs b/library/stdarch/src/x86/sse2.rs index 8b992896d5ea..c08fab33b69b 100644 --- a/library/stdarch/src/x86/sse2.rs +++ b/library/stdarch/src/x86/sse2.rs @@ -5,9 +5,8 @@ use std::mem; use std::os::raw::c_void; use std::ptr; -use simd_llvm::{ - simd_cast, simd_shuffle2, simd_shuffle4, simd_shuffle8, simd_shuffle16, -}; +use simd_llvm::{simd_cast, simd_shuffle16, simd_shuffle2, simd_shuffle4, + simd_shuffle8}; use x86::__m128i; use v128::*; use v64::*; @@ -317,7 +316,9 @@ pub unsafe fn _mm_subs_epu16(a: u16x8, b: u16x8) -> u16x8 { #[cfg_attr(test, assert_instr(pslldq, imm8 = 1))] pub unsafe fn _mm_slli_si128(a: __m128i, imm8: i32) -> __m128i { let (zero, imm8) = (__m128i::splat(0), imm8 as u32); - const fn sub(a: u32, b: u32) -> u32 { a - b } + const fn sub(a: u32, b: u32) -> u32 { + a - b + } macro_rules! shuffle { ($shift:expr) => { simd_shuffle16::<__m128i, __m128i>(zero, a, [ @@ -333,14 +334,22 @@ pub unsafe fn _mm_slli_si128(a: __m128i, imm8: i32) -> __m128i { } } match imm8 { - 0 => shuffle!(0), 1 => shuffle!(1), - 2 => shuffle!(2), 3 => shuffle!(3), - 4 => shuffle!(4), 5 => shuffle!(5), - 6 => shuffle!(6), 7 => shuffle!(7), - 8 => shuffle!(8), 9 => shuffle!(9), - 10 => shuffle!(10), 11 => shuffle!(11), - 12 => shuffle!(12), 13 => shuffle!(13), - 14 => shuffle!(14), 15 => shuffle!(15), + 0 => shuffle!(0), + 1 => shuffle!(1), + 2 => shuffle!(2), + 3 => shuffle!(3), + 4 => shuffle!(4), + 5 => shuffle!(5), + 6 => shuffle!(6), + 7 => shuffle!(7), + 8 => shuffle!(8), + 9 => shuffle!(9), + 10 => shuffle!(10), + 11 => shuffle!(11), + 12 => shuffle!(12), + 13 => shuffle!(13), + 14 => shuffle!(14), + 15 => shuffle!(15), _ => shuffle!(16), } } @@ -365,7 +374,7 @@ pub unsafe fn _mm_bsrli_si128(a: __m128i, imm8: i32) -> __m128i { #[inline(always)] #[target_feature = "+sse2"] #[cfg_attr(test, assert_instr(psllw))] -pub unsafe fn _mm_slli_epi16(a: i16x8, imm8: i32) -> i16x8 { +pub unsafe fn _mm_slli_epi16(a: i16x8, imm8: i32) -> i16x8 { pslliw(a, imm8) } @@ -454,7 +463,9 @@ pub unsafe fn _mm_sra_epi32(a: i32x4, count: i32x4) -> i32x4 { #[cfg_attr(test, assert_instr(psrldq, imm8 = 1))] pub unsafe fn _mm_srli_si128(a: __m128i, imm8: i32) -> __m128i { let (zero, imm8) = (__m128i::splat(0), imm8 as u32); - const fn add(a: u32, b: u32) -> u32 { a + b } + const fn add(a: u32, b: u32) -> u32 { + a + b + } macro_rules! shuffle { ($shift:expr) => { simd_shuffle16::<__m128i, __m128i>(a, zero, [ @@ -470,14 +481,22 @@ pub unsafe fn _mm_srli_si128(a: __m128i, imm8: i32) -> __m128i { } } match imm8 { - 0 => shuffle!(0), 1 => shuffle!(1), - 2 => shuffle!(2), 3 => shuffle!(3), - 4 => shuffle!(4), 5 => shuffle!(5), - 6 => shuffle!(6), 7 => shuffle!(7), - 8 => shuffle!(8), 9 => shuffle!(9), - 10 => shuffle!(10), 11 => shuffle!(11), - 12 => shuffle!(12), 13 => shuffle!(13), - 14 => shuffle!(14), 15 => shuffle!(15), + 0 => shuffle!(0), + 1 => shuffle!(1), + 2 => shuffle!(2), + 3 => shuffle!(3), + 4 => shuffle!(4), + 5 => shuffle!(5), + 6 => shuffle!(6), + 7 => shuffle!(7), + 8 => shuffle!(8), + 9 => shuffle!(9), + 10 => shuffle!(10), + 11 => shuffle!(11), + 12 => shuffle!(12), + 13 => shuffle!(13), + 14 => shuffle!(14), + 15 => shuffle!(15), _ => shuffle!(16), } } @@ -487,7 +506,7 @@ pub unsafe fn _mm_srli_si128(a: __m128i, imm8: i32) -> __m128i { #[inline(always)] #[target_feature = "+sse2"] #[cfg_attr(test, assert_instr(psrlw))] -pub unsafe fn _mm_srli_epi16(a: i16x8, imm8: i32) -> i16x8 { +pub unsafe fn _mm_srli_epi16(a: i16x8, imm8: i32) -> i16x8 { psrliw(a, imm8) } @@ -649,7 +668,7 @@ pub unsafe fn _mm_cmplt_epi32(a: i32x4, b: i32x4) -> i32x4 { #[inline(always)] #[target_feature = "+sse2"] #[cfg_attr(test, assert_instr(cvtdq2pd))] -pub unsafe fn _mm_cvtepi32_pd(a: i32x4) -> f64x2 { +pub unsafe fn _mm_cvtepi32_pd(a: i32x4) -> f64x2 { simd_cast::(simd_shuffle2(a, a, [0, 1])) } @@ -777,7 +796,7 @@ pub unsafe fn _mm_set_epi32(e3: i32, e2: i32, e1: i32, e0: i32) -> i32x4 { #[target_feature = "+sse2"] // no particular instruction to test pub unsafe fn _mm_set_epi16( - e7: i16, e6: i16, e5: i16, e4: i16, e3: i16, e2: i16, e1: i16, e0: i16, + e7: i16, e6: i16, e5: i16, e4: i16, e3: i16, e2: i16, e1: i16, e0: i16 ) -> i16x8 { i16x8::new(e0, e1, e2, e3, e4, e5, e6, e7) } @@ -790,6 +809,7 @@ pub unsafe fn _mm_set_epi8( e15: i8, e14: i8, e13: i8, e12: i8, e11: i8, e10: i8, e9: i8, e8: i8, e7: i8, e6: i8, e5: i8, e4: i8, e3: i8, e2: i8, e1: i8, e0: i8, ) -> i8x16 { + #[cfg_attr(rustfmt, rustfmt_skip)] i8x16::new( e0, e1, e2, e3, e4, e5, e6, e7, e8, e9, e10, e11, e12, e13, e14, e15, ) @@ -840,7 +860,7 @@ pub unsafe fn _mm_setr_epi32(e3: i32, e2: i32, e1: i32, e0: i32) -> i32x4 { #[target_feature = "+sse2"] // no particular instruction to test pub unsafe fn _mm_setr_epi16( - e7: i16, e6: i16, e5: i16, e4: i16, e3: i16, e2: i16, e1: i16, e0: i16, + e7: i16, e6: i16, e5: i16, e4: i16, e3: i16, e2: i16, e1: i16, e0: i16 ) -> i16x8 { i16x8::new(e7, e6, e5, e4, e3, e2, e1, e0) } @@ -853,6 +873,7 @@ pub unsafe fn _mm_setr_epi8( e15: i8, e14: i8, e13: i8, e12: i8, e11: i8, e10: i8, e9: i8, e8: i8, e7: i8, e6: i8, e5: i8, e4: i8, e3: i8, e2: i8, e1: i8, e0: i8, ) -> i8x16 { + #[cfg_attr(rustfmt, rustfmt_skip)] i8x16::new( e15, e14, e13, e12, e11, e10, e9, e8, e7, e6, e5, e4, e3, e2, e1, e0, ) @@ -895,7 +916,8 @@ pub unsafe fn _mm_loadu_si128(mem_addr: *const __m128i) -> __m128i { ptr::copy_nonoverlapping( mem_addr as *const u8, &mut dst as *mut __m128i as *mut u8, - mem::size_of::<__m128i>()); + mem::size_of::<__m128i>(), + ); dst } @@ -934,7 +956,8 @@ pub unsafe fn _mm_storeu_si128(mem_addr: *mut __m128i, a: __m128i) { ptr::copy_nonoverlapping( &a as *const _ as *const u8, mem_addr as *mut u8, - mem::size_of::<__m128i>()); + mem::size_of::<__m128i>(), + ); } /// Store the lower 64-bit integer `a` to a memory location. @@ -945,7 +968,10 @@ pub unsafe fn _mm_storeu_si128(mem_addr: *mut __m128i, a: __m128i) { // no particular instruction to test pub unsafe fn _mm_storel_epi64(mem_addr: *mut __m128i, a: __m128i) { ptr::copy_nonoverlapping( - &a as *const _ as *const u8, mem_addr as *mut u8, 8); + &a as *const _ as *const u8, + mem_addr as *mut u8, + 8, + ); } /// Return a vector where the low element is extracted from `a` and its upper @@ -1076,7 +1102,9 @@ pub unsafe fn _mm_shuffle_epi32(a: i32x4, imm8: i32) -> i32x4 { pub unsafe fn _mm_shufflehi_epi16(a: i16x8, imm8: i32) -> i16x8 { // See _mm_shuffle_epi32. let imm8 = (imm8 & 0xFF) as u8; - const fn add4(x: u32) -> u32 { x + 4 } + const fn add4(x: u32) -> u32 { + x + 4 + } macro_rules! shuffle_done { ($x01:expr, $x23:expr, $x45:expr, $x67:expr) => { @@ -1183,10 +1211,11 @@ pub unsafe fn _mm_shufflelo_epi16(a: i16x8, imm8: i32) -> i16x8 { #[target_feature = "+sse2"] #[cfg_attr(test, assert_instr(punpckhbw))] pub unsafe fn _mm_unpackhi_epi8(a: i8x16, b: i8x16) -> i8x16 { - simd_shuffle16(a, b, [ - 8, 24, 9, 25, 10, 26, 11, 27, - 12, 28, 13, 29, 14, 30, 15, 31, - ]) + simd_shuffle16( + a, + b, + [8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31], + ) } /// Unpack and interleave 16-bit integers from the high half of `a` and `b`. @@ -1218,10 +1247,11 @@ pub unsafe fn _mm_unpackhi_epi64(a: i64x2, b: i64x2) -> i64x2 { #[target_feature = "+sse2"] #[cfg_attr(test, assert_instr(punpcklbw))] pub unsafe fn _mm_unpacklo_epi8(a: i8x16, b: i8x16) -> i8x16 { - simd_shuffle16(a, b, [ - 0, 16, 1, 17, 2, 18, 3, 19, - 4, 20, 5, 21, 6, 22, 7, 23, - ]) + simd_shuffle16( + a, + b, + [0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23], + ) } /// Unpack and interleave 16-bit integers from the low half of `a` and `b`. @@ -1718,7 +1748,8 @@ pub unsafe fn _mm_ucomineq_sd(a: f64x2, b: f64x2) -> bool { mem::transmute(ucomineqsd(a, b) as u8) } -/// Convert packed double-precision (64-bit) floating-point elements in "a" to packed single-precision (32-bit) floating-point elements +/// Convert packed double-precision (64-bit) floating-point elements in "a" to +/// packed single-precision (32-bit) floating-point elements #[inline(always)] #[target_feature = "+sse2"] #[cfg_attr(test, assert_instr(cvtpd2ps))] @@ -1726,8 +1757,8 @@ pub unsafe fn _mm_cvtpd_ps(a: f64x2) -> f32x4 { cvtpd2ps(a) } - -/// Convert packed single-precision (32-bit) floating-point elements in `a` to packed +/// Convert packed single-precision (32-bit) floating-point elements in `a` to +/// packed /// double-precision (64-bit) floating-point elements. #[inline(always)] #[target_feature = "+sse2"] @@ -1736,7 +1767,8 @@ pub unsafe fn _mm_cvtps_pd(a: f32x4) -> f64x2 { cvtps2pd(a) } -/// Convert packed double-precision (64-bit) floating-point elements in `a` to packed 32-bit integers. +/// Convert packed double-precision (64-bit) floating-point elements in `a` to +/// packed 32-bit integers. #[inline(always)] #[target_feature = "+sse2"] #[cfg_attr(test, assert_instr(cvtpd2dq))] @@ -1744,7 +1776,8 @@ pub unsafe fn _mm_cvtpd_epi32(a: f64x2) -> i32x4 { cvtpd2dq(a) } -/// Convert the lower double-precision (64-bit) floating-point element in a to a 32-bit integer. +/// Convert the lower double-precision (64-bit) floating-point element in a to +/// a 32-bit integer. #[inline(always)] #[target_feature = "+sse2"] #[cfg_attr(test, assert_instr(cvtsd2si))] @@ -1752,7 +1785,8 @@ pub unsafe fn _mm_cvtsd_si32(a: f64x2) -> i32 { cvtsd2si(a) } -/// Convert the lower double-precision (64-bit) floating-point element in a to a 64-bit integer. +/// Convert the lower double-precision (64-bit) floating-point element in a to +/// a 64-bit integer. #[cfg(target_arch = "x86_64")] #[inline(always)] #[target_feature = "+sse2"] @@ -1761,9 +1795,10 @@ pub unsafe fn _mm_cvtsd_si64(a: f64x2) -> i64 { cvtsd2si64(a) } -/// Convert the lower double-precision (64-bit) floating-point element in `b` to a -/// single-precision (32-bit) floating-point element, store the result in the lower element -/// of the return value, and copy the upper element from `a` to the upper element the return value. +/// Convert the lower double-precision (64-bit) floating-point element in `b` +/// to a single-precision (32-bit) floating-point element, store the result in +/// the lower element of the return value, and copy the upper element from `a` +/// to the upper element the return value. #[inline(always)] #[target_feature = "+sse2"] #[cfg_attr(test, assert_instr(cvtsd2ss))] @@ -1771,18 +1806,19 @@ pub unsafe fn _mm_cvtsd_ss(a: f32x4, b: f64x2) -> f32x4 { cvtsd2ss(a, b) } -/// Convert the lower single-precision (32-bit) floating-point element in `b` to a -/// double-precision (64-bit) floating-point element, store the result in the lower element -/// of the return value, and copy the upper element from `a` to the upper element the return value. +/// Convert the lower single-precision (32-bit) floating-point element in `b` +/// to a double-precision (64-bit) floating-point element, store the result in +/// the lower element of the return value, and copy the upper element from `a` +/// to the upper element the return value. #[inline(always)] #[target_feature = "+sse2"] #[cfg_attr(test, assert_instr(cvtss2sd))] -pub unsafe fn _mm_cvtss_sd(a: f64x2, b: f32x4 ) -> f64x2 { +pub unsafe fn _mm_cvtss_sd(a: f64x2, b: f32x4) -> f64x2 { cvtss2sd(a, b) } -/// Convert packed double-precision (64-bit) floating-point elements in `a` to packed -/// 32-bit integers with truncation. +/// Convert packed double-precision (64-bit) floating-point elements in `a` to +/// packed 32-bit integers with truncation. #[inline(always)] #[target_feature = "+sse2"] #[cfg_attr(test, assert_instr(cvttpd2dq))] @@ -1790,8 +1826,8 @@ pub unsafe fn _mm_cvttpd_epi32(a: f64x2) -> i32x4 { cvttpd2dq(a) } -/// Convert the lower double-precision (64-bit) floating-point element in `a` to a 32-bit integer -/// with truncation. +/// Convert the lower double-precision (64-bit) floating-point element in `a` +/// to a 32-bit integer with truncation. #[inline(always)] #[target_feature = "+sse2"] #[cfg_attr(test, assert_instr(cvttsd2si))] @@ -1799,8 +1835,8 @@ pub unsafe fn _mm_cvttsd_si32(a: f64x2) -> i32 { cvttsd2si(a) } -/// Convert the lower double-precision (64-bit) floating-point element in `a` to a 64-bit integer -/// with truncation. +/// Convert the lower double-precision (64-bit) floating-point element in `a` +/// to a 64-bit integer with truncation. #[cfg(target_arch = "x86_64")] #[inline(always)] #[target_feature = "+sse2"] @@ -1809,8 +1845,8 @@ pub unsafe fn _mm_cvttsd_si64(a: f64x2) -> i64 { cvttsd2si64(a) } -/// Convert packed single-precision (32-bit) floating-point elements in `a` to packed 32-bit -/// integers with truncation +/// Convert packed single-precision (32-bit) floating-point elements in `a` to +/// packed 32-bit integers with truncation. #[inline(always)] #[target_feature = "+sse2"] #[cfg_attr(test, assert_instr(cvttps2dq))] @@ -1818,45 +1854,48 @@ pub unsafe fn _mm_cvttps_epi32(a: f32x4) -> i32x4 { cvttps2dq(a) } -/// Copy double-precision (64-bit) floating-point element `a` to the lower element of the -/// packed 64-bit return value +/// Copy double-precision (64-bit) floating-point element `a` to the lower +/// element of the packed 64-bit return value. #[inline(always)] #[target_feature = "+sse2"] pub unsafe fn _mm_set_sd(a: f64) -> f64x2 { f64x2::new(a, 0_f64) } -/// Broadcast double-precision (64-bit) floating-point value a to all elements of the return value +/// Broadcast double-precision (64-bit) floating-point value a to all elements +/// of the return value. #[inline(always)] #[target_feature = "+sse2"] pub unsafe fn _mm_set1_pd(a: f64) -> f64x2 { f64x2::new(a, a) } -/// Broadcast double-precision (64-bit) floating-point value a to all elements of the return value +/// Broadcast double-precision (64-bit) floating-point value a to all elements +/// of the return value. #[inline(always)] #[target_feature = "+sse2"] pub unsafe fn _mm_set_pd1(a: f64) -> f64x2 { f64x2::new(a, a) } -/// Set packed double-precision (64-bit) floating-point elements in the return value with the -/// supplied values. +/// Set packed double-precision (64-bit) floating-point elements in the return +/// value with the supplied values. #[inline(always)] #[target_feature = "+sse2"] pub unsafe fn _mm_set_pd(a: f64, b: f64) -> f64x2 { f64x2::new(b, a) } -/// Set packed double-precision (64-bit) floating-point elements in the return value with the -/// supplied values in reverse order. +/// Set packed double-precision (64-bit) floating-point elements in the return +/// value with the supplied values in reverse order. #[inline(always)] #[target_feature = "+sse2"] pub unsafe fn _mm_setr_pd(a: f64, b: f64) -> f64x2 { f64x2::new(a, b) } -/// returns packed double-precision (64-bit) floating-point elements with all zeros. +/// returns packed double-precision (64-bit) floating-point elements with all +/// zeros. #[inline(always)] #[target_feature = "+sse2"] pub unsafe fn _mm_setzero_pd() -> f64x2 { @@ -1876,9 +1915,10 @@ pub unsafe fn _mm_movemask_pd(a: f64x2) -> i32 { -/// Load 128-bits (composed of 2 packed double-precision (64-bit) floating-point elements) -/// from memory into the returned vector. mem_addr must be aligned on a 16-byte boundary or -/// a general-protection exception may be generated. +/// Load 128-bits (composed of 2 packed double-precision (64-bit) +/// floating-point elements) from memory into the returned vector. +/// `mem_addr` must be aligned on a 16-byte boundary or a general-protection +/// exception may be generated. #[inline(always)] #[target_feature = "+sse2"] #[cfg_attr(test, assert_instr(movaps))] @@ -1886,9 +1926,9 @@ pub unsafe fn _mm_load_pd(mem_addr: *const f64) -> f64x2 { *(mem_addr as *const f64x2) } -/// Store 128-bits (composed of 2 packed double-precision (64-bit) floating-point elements) from `a` -/// into memory. mem_addr must be aligned on a 16-byte boundary or a general-protection exception -/// may be generated. +/// Store 128-bits (composed of 2 packed double-precision (64-bit) +/// floating-point elements) from `a` into memory. `mem_addr` must be aligned +/// on a 16-byte boundary or a general-protection exception may be generated. #[inline(always)] #[target_feature = "+sse2"] #[cfg_attr(test, assert_instr(movaps))] @@ -1906,12 +1946,13 @@ pub unsafe fn _mm_storeu_pd(mem_addr: *mut f64, a: f64x2) { ptr::copy_nonoverlapping( &a as *const f64x2 as *const u8, mem_addr as *mut u8, - mem::size_of::()); + mem::size_of::(), + ); } -/// Store the lower double-precision (64-bit) floating-point element from `a` into 2 contiguous -/// elements in memory. `mem_addr` must be aligned on a 16-byte boundary or a general-protection -/// exception may be generated. +/// Store the lower double-precision (64-bit) floating-point element from `a` +/// into 2 contiguous elements in memory. `mem_addr` must be aligned on a +/// 16-byte boundary or a general-protection exception may be generated. #[inline(always)] #[target_feature = "+sse2"] pub unsafe fn _mm_store1_pd(mem_addr: *mut f64, a: f64x2) { @@ -1919,9 +1960,9 @@ pub unsafe fn _mm_store1_pd(mem_addr: *mut f64, a: f64x2) { *(mem_addr as *mut f64x2) = b; } -/// Store the lower double-precision (64-bit) floating-point element from `a` into 2 contiguous -/// elements in memory. `mem_addr` must be aligned on a 16-byte boundary or a general-protection -/// exception may be generated. +/// Store the lower double-precision (64-bit) floating-point element from `a` +/// into 2 contiguous elements in memory. `mem_addr` must be aligned on a +/// 16-byte boundary or a general-protection exception may be generated. #[inline(always)] #[target_feature = "+sse2"] pub unsafe fn _mm_store_pd1(mem_addr: *mut f64, a: f64x2) { @@ -1929,8 +1970,10 @@ pub unsafe fn _mm_store_pd1(mem_addr: *mut f64, a: f64x2) { *(mem_addr as *mut f64x2) = b; } -/// Store 2 double-precision (64-bit) floating-point elements from `a` into memory in reverse order. -/// `mem_addr` must be aligned on a 16-byte boundary or a general-protection exception may be generated. +/// Store 2 double-precision (64-bit) floating-point elements from `a` into +/// memory in reverse order. +/// `mem_addr` must be aligned on a 16-byte boundary or a general-protection +/// exception may be generated. #[inline(always)] #[target_feature = "+sse2"] pub unsafe fn _mm_storer_pd(mem_addr: *mut f64, a: f64x2) { @@ -1956,9 +1999,9 @@ pub unsafe fn _mm_load_pd1(mem_addr: *const f64) -> f64x2 { f64x2::new(d, d) } -/// Load 2 double-precision (64-bit) floating-point elements from memory into the returned vector -/// in reverse order. mem_addr must be aligned on a 16-byte boundary or a general-protection -/// exception may be generated. +/// Load 2 double-precision (64-bit) floating-point elements from memory into +/// the returned vector in reverse order. `mem_addr` must be aligned on a +/// 16-byte boundary or a general-protection exception may be generated. #[inline(always)] #[target_feature = "+sse2"] #[cfg_attr(test, assert_instr(movapd))] @@ -1967,9 +2010,9 @@ pub unsafe fn _mm_loadr_pd(mem_addr: *const f64) -> f64x2 { simd_shuffle2(a, a, [1, 0]) } -/// Load 128-bits (composed of 2 packed double-precision (64-bit) floating-point elements) -/// from memory into the returned vector. mem_addr does not need to be aligned on any particular -/// oundary. +/// Load 128-bits (composed of 2 packed double-precision (64-bit) +/// floating-point elements) from memory into the returned vector. +/// `mem_addr` does not need to be aligned on any particular boundary. #[inline(always)] #[target_feature = "+sse2"] #[cfg_attr(test, assert_instr(movups))] @@ -1978,7 +2021,8 @@ pub unsafe fn _mm_loadu_pd(mem_addr: *const f64) -> f64x2 { ptr::copy_nonoverlapping( mem_addr as *const u8, &mut dst as *mut f64x2 as *mut u8, - mem::size_of::()); + mem::size_of::(), + ); dst } @@ -1997,7 +2041,7 @@ pub unsafe fn _mm_undefined_si128() -> __m128i { } #[allow(improper_ctypes)] -extern { +extern "C" { #[link_name = "llvm.x86.sse2.pause"] fn pause(); #[link_name = "llvm.x86.sse2.clflush"] @@ -2145,7 +2189,7 @@ extern { #[link_name = "llvm.x86.sse2.cvtsd2ss"] fn cvtsd2ss(a: f32x4, b: f64x2) -> f32x4; #[link_name = "llvm.x86.sse2.cvtss2sd"] - fn cvtss2sd(a: f64x2, b: f32x4 ) -> f64x2; + fn cvtss2sd(a: f64x2, b: f32x4) -> f64x2; #[link_name = "llvm.x86.sse2.cvttpd2dq"] fn cvttpd2dq(a: f64x2) -> i32x4; #[link_name = "llvm.x86.sse2.cvttsd2si"] @@ -2160,7 +2204,7 @@ extern { mod tests { use std::os::raw::c_void; use stdsimd_test::simd_test; - use test::black_box; // Used to inhibit constant-folding. + use test::black_box; // Used to inhibit constant-folding. use v128::*; use x86::{__m128i, sse2}; @@ -2188,13 +2232,17 @@ mod tests { #[simd_test = "sse2"] unsafe fn _mm_add_epi8() { - let a = i8x16::new( - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let a = + i8x16::new(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + #[cfg_attr(rustfmt, rustfmt_skip)] let b = i8x16::new( - 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31); + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, + ); let r = sse2::_mm_add_epi8(a, b); + #[cfg_attr(rustfmt, rustfmt_skip)] let e = i8x16::new( - 16, 18, 20, 22, 24, 26, 28, 30, 32, 34, 36, 38, 40, 42, 44, 46); + 16, 18, 20, 22, 24, 26, 28, 30, 32, 34, 36, 38, 40, 42, 44, 46, + ); assert_eq!(r, e); } @@ -2235,13 +2283,17 @@ mod tests { #[simd_test = "sse2"] unsafe fn _mm_adds_epi8() { - let a = i8x16::new( - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let a = + i8x16::new(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + #[cfg_attr(rustfmt, rustfmt_skip)] let b = i8x16::new( - 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31); + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, + ); let r = sse2::_mm_adds_epi8(a, b); + #[cfg_attr(rustfmt, rustfmt_skip)] let e = i8x16::new( - 16, 18, 20, 22, 24, 26, 28, 30, 32, 34, 36, 38, 40, 42, 44, 46); + 16, 18, 20, 22, 24, 26, 28, 30, 32, 34, 36, 38, 40, 42, 44, 46, + ); assert_eq!(r, e); } @@ -2288,13 +2340,17 @@ mod tests { #[simd_test = "sse2"] unsafe fn _mm_adds_epu8() { - let a = u8x16::new( - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let a = + u8x16::new(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + #[cfg_attr(rustfmt, rustfmt_skip)] let b = u8x16::new( - 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31); + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, + ); let r = sse2::_mm_adds_epu8(a, b); + #[cfg_attr(rustfmt, rustfmt_skip)] let e = u8x16::new( - 16, 18, 20, 22, 24, 26, 28, 30, 32, 34, 36, 38, 40, 42, 44, 46); + 16, 18, 20, 22, 24, 26, 28, 30, 32, 34, 36, 38, 40, 42, 44, 46, + ); assert_eq!(r, e); } @@ -2410,12 +2466,11 @@ mod tests { #[simd_test = "sse2"] unsafe fn _mm_sad_epu8() { + #[cfg_attr(rustfmt, rustfmt_skip)] let a = u8x16::new( - 255, 254, 253, 252, 1, 2, 3, 4, - 155, 154, 153, 152, 1, 2, 3, 4); - let b = u8x16::new( - 0, 0, 0, 0, 2, 1, 2, 1, - 1, 1, 1, 1, 1, 2, 1, 2); + 255, 254, 253, 252, 1, 2, 3, 4, 155, 154, 153, 152, 1, 2, 3, 4, + ); + let b = u8x16::new(0, 0, 0, 0, 2, 1, 2, 1, 1, 1, 1, 1, 1, 2, 1, 2); let r = sse2::_mm_sad_epu8(a, b); let e = u64x2::new(1020, 614); assert_eq!(r, e); @@ -2527,44 +2582,58 @@ mod tests { #[simd_test = "sse2"] unsafe fn _mm_slli_si128() { + #[cfg_attr(rustfmt, rustfmt_skip)] let a = __m128i::new( - 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + ); let r = sse2::_mm_slli_si128(a, 1); - let e = __m128i::new( - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let e = + __m128i::new(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); assert_eq!(r, e); + #[cfg_attr(rustfmt, rustfmt_skip)] let a = __m128i::new( - 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + ); let r = sse2::_mm_slli_si128(a, 15); - let e = __m128i::new( - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1); + let e = __m128i::new(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1); assert_eq!(r, e); + #[cfg_attr(rustfmt, rustfmt_skip)] let a = __m128i::new( - 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + ); let r = sse2::_mm_slli_si128(a, 16); assert_eq!(r, __m128i::splat(0)); + #[cfg_attr(rustfmt, rustfmt_skip)] let a = __m128i::new( - 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + ); let r = sse2::_mm_slli_si128(a, -1); assert_eq!(r, __m128i::splat(0)); + #[cfg_attr(rustfmt, rustfmt_skip)] let a = __m128i::new( - 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + ); let r = sse2::_mm_slli_si128(a, -0x80000000); assert_eq!(r, __m128i::splat(0)); } #[simd_test = "sse2"] unsafe fn _mm_slli_epi16() { + #[cfg_attr(rustfmt, rustfmt_skip)] let a = i16x8::new( - 0xFFFF as u16 as i16, 0x0FFF, 0x00FF, 0x000F, 0, 0, 0, 0); + 0xFFFF as u16 as i16, 0x0FFF, 0x00FF, 0x000F, 0, 0, 0, 0, + ); let r = sse2::_mm_slli_epi16(a, 4); + + #[cfg_attr(rustfmt, rustfmt_skip)] let e = i16x8::new( - 0xFFF0 as u16 as i16, - 0xFFF0 as u16 as i16, 0x0FF0, 0x00F0, 0, 0, 0, 0); + 0xFFF0 as u16 as i16, 0xFFF0 as u16 as i16, 0x0FF0, 0x00F0, + 0, 0, 0, 0, + ); assert_eq!(r, e); } @@ -2635,44 +2704,58 @@ mod tests { #[simd_test = "sse2"] unsafe fn _mm_srli_si128() { + #[cfg_attr(rustfmt, rustfmt_skip)] let a = __m128i::new( - 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + ); let r = sse2::_mm_srli_si128(a, 1); + #[cfg_attr(rustfmt, rustfmt_skip)] let e = __m128i::new( - 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 0); + 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 0, + ); assert_eq!(r, e); + #[cfg_attr(rustfmt, rustfmt_skip)] let a = __m128i::new( - 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + ); let r = sse2::_mm_srli_si128(a, 15); - let e = __m128i::new( - 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0); + let e = __m128i::new(16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0); assert_eq!(r, e); + #[cfg_attr(rustfmt, rustfmt_skip)] let a = __m128i::new( - 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + ); let r = sse2::_mm_srli_si128(a, 16); assert_eq!(r, __m128i::splat(0)); + #[cfg_attr(rustfmt, rustfmt_skip)] let a = __m128i::new( - 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + ); let r = sse2::_mm_srli_si128(a, -1); assert_eq!(r, __m128i::splat(0)); + #[cfg_attr(rustfmt, rustfmt_skip)] let a = __m128i::new( - 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + ); let r = sse2::_mm_srli_si128(a, -0x80000000); assert_eq!(r, __m128i::splat(0)); } #[simd_test = "sse2"] unsafe fn _mm_srli_epi16() { + #[cfg_attr(rustfmt, rustfmt_skip)] let a = i16x8::new( - 0xFFFF as u16 as i16, 0x0FFF, 0x00FF, 0x000F, 0, 0, 0, 0); + 0xFFFF as u16 as i16, 0x0FFF, 0x00FF, 0x000F, 0, 0, 0, 0, + ); let r = sse2::_mm_srli_epi16(a, 4); + #[cfg_attr(rustfmt, rustfmt_skip)] let e = i16x8::new( - 0xFFF as u16 as i16, - 0xFF as u16 as i16, 0xF, 0, 0, 0, 0, 0); + 0xFFF as u16 as i16, 0xFF as u16 as i16, 0xF, 0, 0, 0, 0, 0, + ); assert_eq!(r, e); } @@ -2747,13 +2830,18 @@ mod tests { #[simd_test = "sse2"] unsafe fn _mm_cmpeq_epi8() { - let a = i8x16::new( - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); - let b = i8x16::new( - 15, 14, 2, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); + let a = + i8x16::new(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let b = + i8x16::new(15, 14, 2, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); let r = sse2::_mm_cmpeq_epi8(a, b); - assert_eq!(r, i8x16::new( - 0, 0, 0xFFu8 as i8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)); + assert_eq!( + r, + #[cfg_attr(rustfmt, rustfmt_skip)] + i8x16::new( + 0, 0, 0xFFu8 as i8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 + ) + ); } #[simd_test = "sse2"] @@ -2902,18 +2990,12 @@ mod tests { #[simd_test = "sse2"] unsafe fn _mm_set_epi8() { + #[cfg_attr(rustfmt, rustfmt_skip)] let r = sse2::_mm_set_epi8( - 0, 1, 2, 3, - 4, 5, 6, 7, - 8, 9, 10, 11, - 12, 13, 14, 15, - ); - let e = i8x16::new( - 15, 14, 13, 12, - 11, 10, 9, 8, - 7, 6, 5, 4, - 3, 2, 1, 0, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, ); + let e = + i8x16::new(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); assert_eq!(r, e); } @@ -2955,18 +3037,12 @@ mod tests { #[simd_test = "sse2"] unsafe fn _mm_setr_epi8() { + #[cfg_attr(rustfmt, rustfmt_skip)] let r = sse2::_mm_setr_epi8( - 0, 1, 2, 3, - 4, 5, 6, 7, - 8, 9, 10, 11, - 12, 13, 14, 15, - ); - let e = i8x16::new( - 0, 1, 2, 3, - 4, 5, 6, 7, - 8, 9, 10, 11, - 12, 13, 14, 15, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, ); + let e = + i8x16::new(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); assert_eq!(r, e); } @@ -3042,9 +3118,13 @@ mod tests { let a = i16x8::new(0x80, -0x81, 0, 0, 0, 0, 0, 0); let b = i16x8::new(0, 0, 0, 0, 0, 0, -0x81, 0x80); let r = sse2::_mm_packs_epi16(a, b); - assert_eq!(r, i8x16::new( - 0x7F, -0x80, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, -0x80, 0x7F)); + assert_eq!( + r, + #[cfg_attr(rustfmt, rustfmt_skip)] + i8x16::new( + 0x7F, -0x80, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -0x80, 0x7F + ) + ); } #[simd_test = "sse2"] @@ -3053,7 +3133,9 @@ mod tests { let b = i32x4::new(0, 0, -0x8001, 0x8000); let r = sse2::_mm_packs_epi32(a, b); assert_eq!( - r, i16x8::new(0x7FFF, -0x8000, 0, 0, 0, 0, -0x8000, 0x7FFF)); + r, + i16x8::new(0x7FFF, -0x8000, 0, 0, 0, 0, -0x8000, 0x7FFF) + ); } #[simd_test = "sse2"] @@ -3061,9 +3143,10 @@ mod tests { let a = i16x8::new(0x100, -1, 0, 0, 0, 0, 0, 0); let b = i16x8::new(0, 0, 0, 0, 0, 0, -1, 0x100); let r = sse2::_mm_packus_epi16(a, b); - assert_eq!(r, u8x16::new( - 0xFF, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0xFF)); + assert_eq!( + r, + u8x16::new(0xFF, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0xFF) + ); } #[simd_test = "sse2"] @@ -3082,9 +3165,9 @@ mod tests { #[simd_test = "sse2"] unsafe fn _mm_movemask_epi8() { - let a = i8x16::from(u8x16::new( + let a = i8x16::from(#[cfg_attr(rustfmt, rustfmt_skip)] u8x16::new( 0b1000_0000, 0b0, 0b1000_0000, 0b01, 0b0101, 0b1111_0000, 0, 0, - 0, 0, 0b1111_0000, 0b0101, 0b01, 0b1000_0000, 0b0, 0b1000_0000)); + 0, 0, 0b1111_0000, 0b0101, 0b01, 0b1000_0000, 0b0, 0b1000_0000, )); let r = sse2::_mm_movemask_epi8(a); assert_eq!(r, 0b10100100_00100101); } @@ -3115,13 +3198,17 @@ mod tests { #[simd_test = "sse2"] unsafe fn _mm_unpackhi_epi8() { - let a = i8x16::new( - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let a = + i8x16::new(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + #[cfg_attr(rustfmt, rustfmt_skip)] let b = i8x16::new( - 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31); + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, + ); let r = sse2::_mm_unpackhi_epi8(a, b); + #[cfg_attr(rustfmt, rustfmt_skip)] let e = i8x16::new( - 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31); + 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31, + ); assert_eq!(r, e); } @@ -3154,13 +3241,15 @@ mod tests { #[simd_test = "sse2"] unsafe fn _mm_unpacklo_epi8() { - let a = i8x16::new( - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let a = + i8x16::new(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + #[cfg_attr(rustfmt, rustfmt_skip)] let b = i8x16::new( - 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31); + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, + ); let r = sse2::_mm_unpacklo_epi8(a, b); - let e = i8x16::new( - 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23); + let e = + i8x16::new(0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23); assert_eq!(r, e); } @@ -3825,7 +3914,7 @@ mod tests { #[simd_test = "sse2"] unsafe fn _mm_cvtpd_ps() { - use std::{f64,f32}; + use std::{f32, f64}; let r = sse2::_mm_cvtpd_ps(f64x2::new(-1.0, 5.0)); assert_eq!(r, f32x4::new(-1.0, 5.0, 0.0, 0.0)); @@ -3834,20 +3923,23 @@ mod tests { assert_eq!(r, f32x4::new(-1.0, -5.0, 0.0, 0.0)); let r = sse2::_mm_cvtpd_ps(f64x2::new(f64::MAX, f64::MIN)); - assert_eq!(r, f32x4::new(f32::INFINITY, f32::NEG_INFINITY, 0.0,0.0)); + assert_eq!(r, f32x4::new(f32::INFINITY, f32::NEG_INFINITY, 0.0, 0.0)); - let r = sse2::_mm_cvtpd_ps(f64x2::new(f32::MAX as f64, f32::MIN as f64)); - assert_eq!(r, f32x4::new(f32::MAX, f32::MIN, 0.0,0.0)); + let r = + sse2::_mm_cvtpd_ps(f64x2::new(f32::MAX as f64, f32::MIN as f64)); + assert_eq!(r, f32x4::new(f32::MAX, f32::MIN, 0.0, 0.0)); } #[simd_test = "sse2"] unsafe fn _mm_cvtps_pd() { - use std::{f64, f32}; + use std::{f32, f64}; let r = sse2::_mm_cvtps_pd(f32x4::new(-1.0, 2.0, -3.0, 5.0)); assert_eq!(r, f64x2::new(-1.0, 2.0)); - let r = sse2::_mm_cvtps_pd(f32x4::new(f32::MAX, f32::INFINITY, f32::NEG_INFINITY, f32::MIN)); + let r = sse2::_mm_cvtps_pd( + f32x4::new(f32::MAX, f32::INFINITY, f32::NEG_INFINITY, f32::MIN), + ); assert_eq!(r, f64x2::new(f32::MAX as f64, f64::INFINITY)); } @@ -3864,7 +3956,9 @@ mod tests { let r = sse2::_mm_cvtpd_epi32(f64x2::new(f64::MAX, f64::MIN)); assert_eq!(r, i32x4::new(i32::MIN, i32::MIN, 0, 0)); - let r = sse2::_mm_cvtpd_epi32(f64x2::new(f64::INFINITY, f64::NEG_INFINITY)); + let r = sse2::_mm_cvtpd_epi32( + f64x2::new(f64::INFINITY, f64::NEG_INFINITY), + ); assert_eq!(r, i32x4::new(i32::MIN, i32::MIN, 0, 0)); let r = sse2::_mm_cvtpd_epi32(f64x2::new(f64::NAN, f64::NAN)); @@ -3902,7 +3996,7 @@ mod tests { #[simd_test = "sse2"] unsafe fn _mm_cvtsd_ss() { - use std::{f64, f32}; + use std::{f32, f64}; let a = f32x4::new(-1.1, -2.2, 3.3, 4.4); let b = f64x2::new(2.0, -5.0); @@ -3911,17 +4005,26 @@ mod tests { assert_eq!(r, f32x4::new(2.0, -2.2, 3.3, 4.4)); - let a = f32x4::new(-1.1, f32::NEG_INFINITY, f32::MAX, f32::NEG_INFINITY); + let a = + f32x4::new(-1.1, f32::NEG_INFINITY, f32::MAX, f32::NEG_INFINITY); let b = f64x2::new(f64::INFINITY, -5.0); let r = sse2::_mm_cvtsd_ss(a, b); - assert_eq!(r, f32x4::new(f32::INFINITY, f32::NEG_INFINITY, f32::MAX, f32::NEG_INFINITY)); + assert_eq!( + r, + f32x4::new( + f32::INFINITY, + f32::NEG_INFINITY, + f32::MAX, + f32::NEG_INFINITY + ) + ); } #[simd_test = "sse2"] unsafe fn _mm_cvtss_sd() { - use std::{f64, f32}; + use std::{f32, f64}; let a = f64x2::new(-1.1, 2.2); let b = f32x4::new(1.0, 2.0, 3.0, 4.0); @@ -3984,7 +4087,8 @@ mod tests { let r = sse2::_mm_cvttps_epi32(a); assert_eq!(r, i32x4::new(-1, 2, -3, 6)); - let a = f32x4::new(f32::NEG_INFINITY, f32::INFINITY, f32::MIN, f32::MAX); + let a = + f32x4::new(f32::NEG_INFINITY, f32::INFINITY, f32::MIN, f32::MAX); let r = sse2::_mm_cvttps_epi32(a); assert_eq!(r, i32x4::new(i32::MIN, i32::MIN, i32::MIN, i32::MIN)); } diff --git a/library/stdarch/src/x86/sse3.rs b/library/stdarch/src/x86/sse3.rs index 21164507f5d4..cba3129d1bcb 100644 --- a/library/stdarch/src/x86/sse3.rs +++ b/library/stdarch/src/x86/sse3.rs @@ -106,7 +106,7 @@ pub unsafe fn _mm_moveldup_ps(a: f32x4) -> f32x4 { } #[allow(improper_ctypes)] -extern { +extern "C" { #[link_name = "llvm.x86.sse3.addsub.ps"] fn addsubps(a: f32x4, b: f32x4) -> f32x4; #[link_name = "llvm.x86.sse3.addsub.pd"] @@ -129,7 +129,7 @@ mod tests { use stdsimd_test::simd_test; use v128::*; - use x86::sse3 as sse3; + use x86::sse3; #[simd_test = "sse3"] unsafe fn _mm_addsub_ps() { @@ -181,7 +181,8 @@ mod tests { #[simd_test = "sse3"] unsafe fn _mm_lddqu_si128() { - let a = i8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); + let a = + i8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); let r = sse3::_mm_lddqu_si128(&a); assert_eq!(a, r); } @@ -213,4 +214,4 @@ mod tests { let r = sse3::_mm_loaddup_pd(&d); assert_eq!(r, f64x2::new(d, d)); } -} \ No newline at end of file +} diff --git a/library/stdarch/src/x86/sse41.rs b/library/stdarch/src/x86/sse41.rs index a804ed2e9edc..e3cf9154bd1c 100644 --- a/library/stdarch/src/x86/sse41.rs +++ b/library/stdarch/src/x86/sse41.rs @@ -15,7 +15,7 @@ pub unsafe fn _mm_blendv_epi8(a: i8x16, b: i8x16, mask: i8x16) -> i8x16 { #[inline(always)] #[target_feature = "+sse4.1"] -#[cfg_attr(test, assert_instr(pblendw, imm8=0xF0))] +#[cfg_attr(test, assert_instr(pblendw, imm8 = 0xF0))] pub unsafe fn _mm_blend_epi16(a: i16x8, b: i16x8, imm8: u8) -> i16x8 { macro_rules! call { ($imm8:expr) => { pblendw(a, b, $imm8) } @@ -23,7 +23,8 @@ pub unsafe fn _mm_blend_epi16(a: i16x8, b: i16x8, imm8: u8) -> i16x8 { constify_imm8!(imm8, call) } -/// Blend packed double-precision (64-bit) floating-point elements from `a` and `b` using `mask` +/// Blend packed double-precision (64-bit) floating-point elements from `a` +/// and `b` using `mask` #[inline(always)] #[target_feature = "+sse4.1"] #[cfg_attr(test, assert_instr(blendvpd))] @@ -31,7 +32,8 @@ pub unsafe fn _mm_blendv_pd(a: f64x2, b: f64x2, mask: f64x2) -> f64x2 { blendvpd(a, b, mask) } -/// Blend packed single-precision (32-bit) floating-point elements from `a` and `b` using `mask` +/// Blend packed single-precision (32-bit) floating-point elements from `a` +/// and `b` using `mask` #[inline(always)] #[target_feature = "+sse4.1"] #[cfg_attr(test, assert_instr(blendvps))] @@ -39,10 +41,11 @@ pub unsafe fn _mm_blendv_ps(a: f32x4, b: f32x4, mask: f32x4) -> f32x4 { blendvps(a, b, mask) } -/// Blend packed double-precision (64-bit) floating-point elements from `a` and `b` using control mask `imm2` +/// Blend packed double-precision (64-bit) floating-point elements from `a` +/// and `b` using control mask `imm2` #[inline(always)] #[target_feature = "+sse4.1"] -#[cfg_attr(test, assert_instr(blendpd, imm2=0b10))] +#[cfg_attr(test, assert_instr(blendpd, imm2 = 0b10))] pub unsafe fn _mm_blend_pd(a: f64x2, b: f64x2, imm2: u8) -> f64x2 { macro_rules! call { ($imm2:expr) => { blendpd(a, b, $imm2) } @@ -50,10 +53,11 @@ pub unsafe fn _mm_blend_pd(a: f64x2, b: f64x2, imm2: u8) -> f64x2 { constify_imm2!(imm2, call) } -/// Blend packed single-precision (32-bit) floating-point elements from `a` and `b` using mask `imm4` +/// Blend packed single-precision (32-bit) floating-point elements from `a` +/// and `b` using mask `imm4` #[inline(always)] #[target_feature = "+sse4.1"] -#[cfg_attr(test, assert_instr(blendps, imm4=0b0101))] +#[cfg_attr(test, assert_instr(blendps, imm4 = 0b0101))] pub unsafe fn _mm_blend_ps(a: f32x4, b: f32x4, imm4: u8) -> f32x4 { macro_rules! call { ($imm4:expr) => { blendps(a, b, $imm4) } @@ -61,11 +65,12 @@ pub unsafe fn _mm_blend_ps(a: f32x4, b: f32x4, imm4: u8) -> f32x4 { constify_imm4!(imm4, call) } -/// Extract a single-precision (32-bit) floating-point element from `a`, selected with `imm8` +/// Extract a single-precision (32-bit) floating-point element from `a`, +/// selected with `imm8` #[inline(always)] #[target_feature = "+sse4.1"] // TODO: Add test for Windows -#[cfg_attr(all(test, not(windows)), assert_instr(extractps, imm8=0))] +#[cfg_attr(all(test, not(windows)), assert_instr(extractps, imm8 = 0))] pub unsafe fn _mm_extract_ps(a: f32x4, imm8: u8) -> i32 { mem::transmute(a.extract(imm8 as u32 & 0b11)) } @@ -73,7 +78,7 @@ pub unsafe fn _mm_extract_ps(a: f32x4, imm8: u8) -> i32 { /// Extract an 8-bit integer from `a` selected with `imm8` #[inline(always)] #[target_feature = "+sse4.1"] -#[cfg_attr(test, assert_instr(pextrb, imm8=0))] +#[cfg_attr(test, assert_instr(pextrb, imm8 = 0))] pub unsafe fn _mm_extract_epi8(a: i8x16, imm8: u8) -> i8 { a.extract((imm8 & 0b1111) as u32) } @@ -82,7 +87,7 @@ pub unsafe fn _mm_extract_epi8(a: i8x16, imm8: u8) -> i8 { #[inline(always)] #[target_feature = "+sse4.1"] // TODO: Add test for Windows -#[cfg_attr(all(test, not(windows)), assert_instr(pextrd, imm8=1))] +#[cfg_attr(all(test, not(windows)), assert_instr(pextrd, imm8 = 1))] pub unsafe fn _mm_extract_epi32(a: i32x4, imm8: u8) -> i32 { a.extract((imm8 & 0b11) as u32) } @@ -92,15 +97,16 @@ pub unsafe fn _mm_extract_epi32(a: i32x4, imm8: u8) -> i32 { #[inline(always)] #[target_feature = "+sse4.1"] // TODO: Add test for Windows -#[cfg_attr(all(test, not(windows)), assert_instr(pextrq, imm8=1))] +#[cfg_attr(all(test, not(windows)), assert_instr(pextrq, imm8 = 1))] pub unsafe fn _mm_extract_epi64(a: i64x2, imm8: u8) -> i64 { a.extract((imm8 & 0b1) as u32) } -/// Select a single value in `a` to store at some position in `b`, +/// Select a single value in `a` to store at some position in `b`, /// Then zero elements according to `imm8`. -/// -/// `imm8` specifies which bits from operand `a` will be copied, which bits in the +/// +/// `imm8` specifies which bits from operand `a` will be copied, which bits in +/// the /// result they will be copied to, and which bits in the result will be /// cleared. The following assignments are made: /// @@ -121,7 +127,7 @@ pub unsafe fn _mm_extract_epi64(a: i64x2, imm8: u8) -> i64 { /// element is cleared. #[inline(always)] #[target_feature = "+sse4.1"] -#[cfg_attr(test, assert_instr(insertps, imm8=0b1010))] +#[cfg_attr(test, assert_instr(insertps, imm8 = 0b1010))] pub unsafe fn _mm_insert_ps(a: f32x4, b: f32x4, imm8: u8) -> f32x4 { macro_rules! call { ($imm8:expr) => { insertps(a, b, $imm8) } @@ -129,59 +135,66 @@ pub unsafe fn _mm_insert_ps(a: f32x4, b: f32x4, imm8: u8) -> f32x4 { constify_imm8!(imm8, call) } -/// Return a copy of `a` with the 8-bit integer from `i` inserted at a location specified by `imm8`. +/// Return a copy of `a` with the 8-bit integer from `i` inserted at a +/// location specified by `imm8`. #[inline(always)] #[target_feature = "+sse4.1"] -#[cfg_attr(test, assert_instr(pinsrb, imm8=0))] +#[cfg_attr(test, assert_instr(pinsrb, imm8 = 0))] pub unsafe fn _mm_insert_epi8(a: i8x16, i: i8, imm8: u8) -> i8x16 { a.replace((imm8 & 0b1111) as u32, i) } -/// Return a copy of `a` with the 32-bit integer from `i` inserted at a location specified by `imm8`. +/// Return a copy of `a` with the 32-bit integer from `i` inserted at a +/// location specified by `imm8`. #[inline(always)] #[target_feature = "+sse4.1"] -#[cfg_attr(test, assert_instr(pinsrd, imm8=0))] +#[cfg_attr(test, assert_instr(pinsrd, imm8 = 0))] pub unsafe fn _mm_insert_epi32(a: i32x4, i: i32, imm8: u8) -> i32x4 { a.replace((imm8 & 0b11) as u32, i) } -/// Return a copy of `a` with the 64-bit integer from `i` inserted at a location specified by `imm8`. +/// Return a copy of `a` with the 64-bit integer from `i` inserted at a +/// location specified by `imm8`. #[cfg(target_arch = "x86_64")] #[inline(always)] #[target_feature = "+sse4.1"] -#[cfg_attr(test, assert_instr(pinsrq, imm8=0))] +#[cfg_attr(test, assert_instr(pinsrq, imm8 = 0))] pub unsafe fn _mm_insert_epi64(a: i64x2, i: i64, imm8: u8) -> i64x2 { a.replace((imm8 & 0b1) as u32, i) } -/// Compare packed 8-bit integers in `a` and `b`,87 and return packed maximum values in dst. +/// Compare packed 8-bit integers in `a` and `b`,87 and return packed maximum +/// values in dst. #[inline(always)] #[target_feature = "+sse4.1"] -#[cfg_attr(test, assert_instr(pmaxsb, imm8=0))] +#[cfg_attr(test, assert_instr(pmaxsb, imm8 = 0))] pub unsafe fn _mm_max_epi8(a: i8x16, b: i8x16) -> i8x16 { pmaxsb(a, b) } -/// Compare packed unsigned 16-bit integers in `a` and `b`, and return packed maximum. +/// Compare packed unsigned 16-bit integers in `a` and `b`, and return packed +/// maximum. #[inline(always)] #[target_feature = "+sse4.1"] -#[cfg_attr(test, assert_instr(pmaxuw, imm8=0))] +#[cfg_attr(test, assert_instr(pmaxuw, imm8 = 0))] pub unsafe fn _mm_max_epu16(a: u16x8, b: u16x8) -> u16x8 { pmaxuw(a, b) } -// Compare packed 32-bit integers in `a` and `b`, and return packed maximum values. +// Compare packed 32-bit integers in `a` and `b`, and return packed maximum +// values. #[inline(always)] #[target_feature = "+sse4.1"] -#[cfg_attr(test, assert_instr(pmaxsd, imm8=0))] +#[cfg_attr(test, assert_instr(pmaxsd, imm8 = 0))] pub unsafe fn _mm_max_epi32(a: i32x4, b: i32x4) -> i32x4 { pmaxsd(a, b) } -// Compare packed unsigned 32-bit integers in `a` and `b`, and return packed maximum values. +// Compare packed unsigned 32-bit integers in `a` and `b`, and return packed +// maximum values. #[inline(always)] #[target_feature = "+sse4.1"] -#[cfg_attr(test, assert_instr(pmaxud, imm8=0))] +#[cfg_attr(test, assert_instr(pmaxud, imm8 = 0))] pub unsafe fn _mm_max_epu32(a: u32x4, b: u32x4) -> u32x4 { pmaxud(a, b) } @@ -221,7 +234,7 @@ pub unsafe fn _mm_dp_ps(a: f32x4, b: f32x4, imm8: u8) -> f32x4 { } #[allow(improper_ctypes)] -extern { +extern "C" { #[link_name = "llvm.x86.sse41.pblendvb"] fn pblendvb(a: i8x16, b: i8x16, mask: i8x16) -> i8x16; #[link_name = "llvm.x86.sse41.blendvpd"] @@ -261,14 +274,18 @@ mod tests { #[simd_test = "sse4.1"] unsafe fn _mm_blendv_epi8() { - let a = i8x16::new( - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let a = + i8x16::new(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + #[cfg_attr(rustfmt, rustfmt_skip)] let b = i8x16::new( - 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31); - let mask = i8x16::new( - 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1); + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, + ); + let mask = + i8x16::new(0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1); + #[cfg_attr(rustfmt, rustfmt_skip)] let e = i8x16::new( - 0, 17, 2, 19, 4, 21, 6, 23, 8, 25, 10, 27, 12, 29, 14, 31); + 0, 17, 2, 19, 4, 21, 6, 23, 8, 25, 10, 27, 12, 29, 14, 31, + ); assert_eq!(sse41::_mm_blendv_epi8(a, b, mask), e); } @@ -286,7 +303,7 @@ mod tests { unsafe fn _mm_blendv_ps() { let a = f32x4::splat(0.0); let b = f32x4::splat(1.0); - let mask = mem::transmute(i32x4::new(0,-1, 0, -1)); + let mask = mem::transmute(i32x4::new(0, -1, 0, -1)); let r = sse41::_mm_blendv_ps(a, b, mask); let e = f32x4::new(0.0, 1.0, 0.0, 1.0); assert_eq!(r, e); @@ -330,7 +347,8 @@ mod tests { #[simd_test = "sse4.1"] unsafe fn _mm_extract_epi8() { - let a = i8x16::new(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let a = + i8x16::new(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); let r = sse41::_mm_extract_epi8(a, 1); assert_eq!(r, 1); let r = sse41::_mm_extract_epi8(a, 17); @@ -398,10 +416,22 @@ mod tests { #[simd_test = "sse4.1"] unsafe fn _mm_max_epi8() { - let a = i8x16::new(1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29, 32); - let b = i8x16::new(2, 3, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31); + #[cfg_attr(rustfmt, rustfmt_skip)] + let a = i8x16::new( + 1, 4, 5, 8, 9, 12, 13, 16, + 17, 20, 21, 24, 25, 28, 29, 32, + ); + #[cfg_attr(rustfmt, rustfmt_skip)] + let b = i8x16::new( + 2, 3, 6, 7, 10, 11, 14, 15, + 18, 19, 22, 23, 26, 27, 30, 31, + ); let r = sse41::_mm_max_epi8(a, b); - let e = i8x16::new(2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 32); + #[cfg_attr(rustfmt, rustfmt_skip)] + let e = i8x16::new( + 2, 4, 6, 8, 10, 12, 14, 16, + 18, 20, 22, 24, 26, 28, 30, 32, + ); assert_eq!(r, e); } diff --git a/library/stdarch/src/x86/sse42.rs b/library/stdarch/src/x86/sse42.rs index cc5a827d35f6..eb551684c9a2 100644 --- a/library/stdarch/src/x86/sse42.rs +++ b/library/stdarch/src/x86/sse42.rs @@ -15,7 +15,8 @@ pub const _SIDD_SWORD_OPS: i8 = 0b00000011; /// For each character in `a`, find if it is in `b` *(Default)* pub const _SIDD_CMP_EQUAL_ANY: i8 = 0b00000000; -/// For each character in `a`, determine if `b[0] <= c <= b[1] or b[1] <= c <= b[2]...` +/// For each character in `a`, determine if `b[0] <= c <= b[1] or b[1] <= c <= +/// b[2]...` pub const _SIDD_CMP_RANGES: i8 = 0b00000100; /// The strings defined by `a` and `b` are equal pub const _SIDD_CMP_EQUAL_EACH: i8 = 0b00001000; @@ -46,11 +47,7 @@ pub const _SIDD_UNIT_MASK: i8 = 0b01000000; #[inline(always)] #[target_feature = "+sse4.2"] #[cfg_attr(test, assert_instr(pcmpistrm, imm8 = 0))] -pub unsafe fn _mm_cmpistrm( - a: __m128i, - b: __m128i, - imm8: i8, -) -> u8x16 { +pub unsafe fn _mm_cmpistrm(a: __m128i, b: __m128i, imm8: i8) -> u8x16 { macro_rules! call { ($imm8:expr) => { pcmpistrm128(a, b, $imm8) } } @@ -58,9 +55,9 @@ pub unsafe fn _mm_cmpistrm( } /// Compare packed strings with implicit lengths in `a` and `b` using the -/// control in `imm8`, and return the generated index. Similar to [`_mm_cmpestri`] -/// with the excception that [`_mm_cmpestri`] requires the lengths of `a` and -/// `b` to be explicitly specified. +/// control in `imm8`, and return the generated index. Similar to +/// [`_mm_cmpestri`] with the excception that [`_mm_cmpestri`] requires the +/// lengths of `a` and `b` to be explicitly specified. /// /// # Control modes /// @@ -105,7 +102,8 @@ pub unsafe fn _mm_cmpistrm( /// use stdsimd::simd::u8x16; /// use stdsimd::vendor::{__m128i, _mm_cmpistri, _SIDD_CMP_EQUAL_ORDERED}; /// -/// let haystack = b"This is a long string of text data\r\n\tthat extends multiple lines"; +/// let haystack = b"This is a long string of text data\r\n\tthat extends +/// multiple lines"; /// let needle = b"\r\n\t\0\0\0\0\0\0\0\0\0\0\0\0\0"; /// /// let a = __m128i::from(u8x16::load(needle, 0)); @@ -171,8 +169,8 @@ pub unsafe fn _mm_cmpistrm( /// # } /// ``` /// -/// Find the index of the first character in the haystack that is within a range -/// of characters. +/// Find the index of the first character in the haystack that is within a +/// range of characters. /// /// ``` /// # #![feature(cfg_target_feature)] @@ -269,11 +267,7 @@ pub unsafe fn _mm_cmpistrm( #[inline(always)] #[target_feature = "+sse4.2"] #[cfg_attr(test, assert_instr(pcmpistri, imm8 = 0))] -pub unsafe fn _mm_cmpistri( - a: __m128i, - b: __m128i, - imm8: i8, -) -> i32 { +pub unsafe fn _mm_cmpistri(a: __m128i, b: __m128i, imm8: i8) -> i32 { macro_rules! call { ($imm8:expr) => { pcmpistri128(a, b, $imm8) } } @@ -286,11 +280,7 @@ pub unsafe fn _mm_cmpistri( #[inline(always)] #[target_feature = "+sse4.2"] #[cfg_attr(test, assert_instr(pcmpistri, imm8 = 0))] -pub unsafe fn _mm_cmpistrz( - a: __m128i, - b: __m128i, - imm8: i8, -) -> i32 { +pub unsafe fn _mm_cmpistrz(a: __m128i, b: __m128i, imm8: i8) -> i32 { macro_rules! call { ($imm8:expr) => { pcmpistriz128(a, b, $imm8) } } @@ -303,11 +293,7 @@ pub unsafe fn _mm_cmpistrz( #[inline(always)] #[target_feature = "+sse4.2"] #[cfg_attr(test, assert_instr(pcmpistri, imm8 = 0))] -pub unsafe fn _mm_cmpistrc( - a: __m128i, - b: __m128i, - imm8: i8, -) -> i32 { +pub unsafe fn _mm_cmpistrc(a: __m128i, b: __m128i, imm8: i8) -> i32 { macro_rules! call { ($imm8:expr) => { pcmpistric128(a, b, $imm8) } } @@ -320,11 +306,7 @@ pub unsafe fn _mm_cmpistrc( #[inline(always)] #[target_feature = "+sse4.2"] #[cfg_attr(test, assert_instr(pcmpistri, imm8 = 0))] -pub unsafe fn _mm_cmpistrs( - a: __m128i, - b: __m128i, - imm8: i8, -) -> i32 { +pub unsafe fn _mm_cmpistrs(a: __m128i, b: __m128i, imm8: i8) -> i32 { macro_rules! call { ($imm8:expr) => { pcmpistris128(a, b, $imm8) } } @@ -336,11 +318,7 @@ pub unsafe fn _mm_cmpistrs( #[inline(always)] #[target_feature = "+sse4.2"] #[cfg_attr(test, assert_instr(pcmpistri, imm8 = 0))] -pub unsafe fn _mm_cmpistro( - a: __m128i, - b: __m128i, - imm8: i8, -) -> i32 { +pub unsafe fn _mm_cmpistro(a: __m128i, b: __m128i, imm8: i8) -> i32 { macro_rules! call { ($imm8:expr) => { pcmpistrio128(a, b, $imm8) } } @@ -353,11 +331,7 @@ pub unsafe fn _mm_cmpistro( #[inline(always)] #[target_feature = "+sse4.2"] #[cfg_attr(test, assert_instr(pcmpistri, imm8 = 0))] -pub unsafe fn _mm_cmpistra( - a: __m128i, - b: __m128i, - imm8: i8, -) -> i32 { +pub unsafe fn _mm_cmpistra(a: __m128i, b: __m128i, imm8: i8) -> i32 { macro_rules! call { ($imm8:expr) => { pcmpistria128(a, b, $imm8) } } @@ -370,11 +344,7 @@ pub unsafe fn _mm_cmpistra( #[target_feature = "+sse4.2"] #[cfg_attr(test, assert_instr(pcmpestrm, imm8 = 0))] pub unsafe fn _mm_cmpestrm( - a: __m128i, - la: i32, - b: __m128i, - lb: i32, - imm8: i8, + a: __m128i, la: i32, b: __m128i, lb: i32, imm8: i8 ) -> u8x16 { macro_rules! call { ($imm8:expr) => { pcmpestrm128(a, la, b, lb, $imm8) } @@ -383,9 +353,9 @@ pub unsafe fn _mm_cmpestrm( } /// Compare packed strings `a` and `b` with lengths `la` and `lb` using the -/// control in `imm8`, and return the generated index. Similar to [`_mm_cmpistri`] -/// with the excception that [`_mm_cmpistri`] implicityly determines the length of -/// `a` and `b`. +/// control in `imm8`, and return the generated index. Similar to +/// [`_mm_cmpistri`] with the excception that [`_mm_cmpistri`] implicityly +/// determines the length of `a` and `b`. /// /// # Control modes /// @@ -468,11 +438,7 @@ pub unsafe fn _mm_cmpestrm( #[target_feature = "+sse4.2"] #[cfg_attr(test, assert_instr(pcmpestri, imm8 = 0))] pub unsafe fn _mm_cmpestri( - a: __m128i, - la: i32, - b: __m128i, - lb: i32, - imm8: i8, + a: __m128i, la: i32, b: __m128i, lb: i32, imm8: i8 ) -> i32 { macro_rules! call { ($imm8:expr) => { pcmpestri128(a, la, b, lb, $imm8) } @@ -487,11 +453,7 @@ pub unsafe fn _mm_cmpestri( #[target_feature = "+sse4.2"] #[cfg_attr(test, assert_instr(pcmpestri, imm8 = 0))] pub unsafe fn _mm_cmpestrz( - a: __m128i, - la: i32, - b: __m128i, - lb: i32, - imm8: i8, + a: __m128i, la: i32, b: __m128i, lb: i32, imm8: i8 ) -> i32 { macro_rules! call { ($imm8:expr) => { pcmpestriz128(a, la, b, lb, $imm8) } @@ -506,11 +468,7 @@ pub unsafe fn _mm_cmpestrz( #[target_feature = "+sse4.2"] #[cfg_attr(test, assert_instr(pcmpestri, imm8 = 0))] pub unsafe fn _mm_cmpestrc( - a: __m128i, - la: i32, - b: __m128i, - lb: i32, - imm8: i8, + a: __m128i, la: i32, b: __m128i, lb: i32, imm8: i8 ) -> i32 { macro_rules! call { ($imm8:expr) => { pcmpestric128(a, la, b, lb, $imm8) } @@ -525,11 +483,7 @@ pub unsafe fn _mm_cmpestrc( #[target_feature = "+sse4.2"] #[cfg_attr(test, assert_instr(pcmpestri, imm8 = 0))] pub unsafe fn _mm_cmpestrs( - a: __m128i, - la: i32, - b: __m128i, - lb: i32, - imm8: i8, + a: __m128i, la: i32, b: __m128i, lb: i32, imm8: i8 ) -> i32 { macro_rules! call { ($imm8:expr) => { pcmpestris128(a, la, b, lb, $imm8) } @@ -544,11 +498,7 @@ pub unsafe fn _mm_cmpestrs( #[target_feature = "+sse4.2"] #[cfg_attr(test, assert_instr(pcmpestri, imm8 = 0))] pub unsafe fn _mm_cmpestro( - a: __m128i, - la: i32, - b: __m128i, - lb: i32, - imm8: i8, + a: __m128i, la: i32, b: __m128i, lb: i32, imm8: i8 ) -> i32 { macro_rules! call { ($imm8:expr) => { pcmpestrio128(a, la, b, lb, $imm8) } @@ -564,11 +514,7 @@ pub unsafe fn _mm_cmpestro( #[target_feature = "+sse4.2"] #[cfg_attr(test, assert_instr(pcmpestri, imm8 = 0))] pub unsafe fn _mm_cmpestra( - a: __m128i, - la: i32, - b: __m128i, - lb: i32, - imm8: i8, + a: __m128i, la: i32, b: __m128i, lb: i32, imm8: i8 ) -> i32 { macro_rules! call { ($imm8:expr) => { pcmpestria128(a, la, b, lb, $imm8) } @@ -624,22 +570,35 @@ pub unsafe fn _mm_cmpgt_epi64(a: i64x2, b: i64x2) -> i64x2 { } #[allow(improper_ctypes)] -extern { +extern "C" { // SSE 4.2 string and text comparison ops #[link_name = "llvm.x86.sse42.pcmpestrm128"] - fn pcmpestrm128(a: __m128i, la: i32, b: __m128i, lb: i32, imm8: i8) -> u8x16; + fn pcmpestrm128( + a: __m128i, la: i32, b: __m128i, lb: i32, imm8: i8 + ) -> u8x16; #[link_name = "llvm.x86.sse42.pcmpestri128"] - fn pcmpestri128(a: __m128i, la: i32, b: __m128i, lb: i32, imm8: i8) -> i32; + fn pcmpestri128(a: __m128i, la: i32, b: __m128i, lb: i32, imm8: i8) + -> i32; #[link_name = "llvm.x86.sse42.pcmpestriz128"] - fn pcmpestriz128(a: __m128i, la: i32, b: __m128i, lb: i32, imm8: i8) -> i32; + fn pcmpestriz128( + a: __m128i, la: i32, b: __m128i, lb: i32, imm8: i8 + ) -> i32; #[link_name = "llvm.x86.sse42.pcmpestric128"] - fn pcmpestric128(a: __m128i, la: i32, b: __m128i, lb: i32, imm8: i8) -> i32; + fn pcmpestric128( + a: __m128i, la: i32, b: __m128i, lb: i32, imm8: i8 + ) -> i32; #[link_name = "llvm.x86.sse42.pcmpestris128"] - fn pcmpestris128(a: __m128i, la: i32, b: __m128i, lb: i32, imm8: i8) -> i32; + fn pcmpestris128( + a: __m128i, la: i32, b: __m128i, lb: i32, imm8: i8 + ) -> i32; #[link_name = "llvm.x86.sse42.pcmpestrio128"] - fn pcmpestrio128(a: __m128i, la: i32, b: __m128i, lb: i32, imm8: i8) -> i32; + fn pcmpestrio128( + a: __m128i, la: i32, b: __m128i, lb: i32, imm8: i8 + ) -> i32; #[link_name = "llvm.x86.sse42.pcmpestria128"] - fn pcmpestria128(a: __m128i, la: i32, b: __m128i, lb: i32, imm8: i8) -> i32; + fn pcmpestria128( + a: __m128i, la: i32, b: __m128i, lb: i32, imm8: i8 + ) -> i32; #[link_name = "llvm.x86.sse42.pcmpistrm128"] fn pcmpistrm128(a: __m128i, b: __m128i, imm8: i8) -> u8x16; #[link_name = "llvm.x86.sse42.pcmpistri128"] @@ -685,7 +644,8 @@ mod tests { ptr::copy_nonoverlapping( s.get_unchecked(0) as *const u8 as *const u8, slice.get_unchecked_mut(0) as *mut u8 as *mut u8, - s.len()); + s.len(), + ); __m128i::from(u8x16::load(slice, 0)) } @@ -694,8 +654,11 @@ mod tests { let a = str_to_m128i(b"Hello! Good-Bye!"); let b = str_to_m128i(b"hello! good-bye!"); let i = sse42::_mm_cmpistrm(a, b, sse42::_SIDD_UNIT_MASK); - let res = u8x16::new(0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, - 0xff, 0xff, 0xff, 0xff, 0x00, 0xff, 0xff, 0xff); + #[cfg_attr(rustfmt, rustfmt_skip)] + let res = u8x16::new( + 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, + 0xff, 0xff, 0xff, 0xff, 0x00, 0xff, 0xff, 0xff, + ); assert_eq!(i, res); } @@ -733,14 +696,23 @@ mod tests { #[simd_test = "sse4.2"] unsafe fn _mm_cmpistro() { - let a_bytes = u8x16::new(0x00, 0x47, 0x00, 0x65, 0x00, 0x6c, 0x00, 0x6c, - 0x00, 0x6f, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00); - let b_bytes = u8x16::new(0x00, 0x48, 0x00, 0x65, 0x00, 0x6c, 0x00, 0x6c, - 0x00, 0x6f, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00); + #[cfg_attr(rustfmt, rustfmt_skip)] + let a_bytes = u8x16::new( + 0x00, 0x47, 0x00, 0x65, 0x00, 0x6c, 0x00, 0x6c, + 0x00, 0x6f, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + ); + #[cfg_attr(rustfmt, rustfmt_skip)] + let b_bytes = u8x16::new( + 0x00, 0x48, 0x00, 0x65, 0x00, 0x6c, 0x00, 0x6c, + 0x00, 0x6f, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + ); let a = __m128i::from(a_bytes); let b = __m128i::from(b_bytes); let i = sse42::_mm_cmpistro( - a, b, sse42::_SIDD_UWORD_OPS | sse42::_SIDD_UNIT_MASK); + a, + b, + sse42::_SIDD_UWORD_OPS | sse42::_SIDD_UNIT_MASK, + ); assert_eq!(0, i); } @@ -757,15 +729,20 @@ mod tests { let a = str_to_m128i(b"Hello!"); let b = str_to_m128i(b"Hello."); let i = sse42::_mm_cmpestrm(a, 5, b, 5, sse42::_SIDD_UNIT_MASK); - assert_eq!(i, u8x16::new(0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00)); + #[cfg_attr(rustfmt, rustfmt_skip)] + let r = u8x16::new( + 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 + ); + assert_eq!(i, r); } #[simd_test = "sse4.2"] unsafe fn _mm_cmpestri() { let a = str_to_m128i(b"bar - garbage"); let b = str_to_m128i(b"foobar"); - let i = sse42::_mm_cmpestri(a, 3, b, 6, sse42::_SIDD_CMP_EQUAL_ORDERED); + let i = + sse42::_mm_cmpestri(a, 3, b, 6, sse42::_SIDD_CMP_EQUAL_ORDERED); assert_eq!(3, i); } @@ -773,8 +750,8 @@ mod tests { unsafe fn _mm_cmpestrz() { let a = str_to_m128i(b""); let b = str_to_m128i(b"Hello"); - let i = sse42::_mm_cmpestrz( - a, 16, b, 6, sse42::_SIDD_CMP_EQUAL_ORDERED); + let i = + sse42::_mm_cmpestrz(a, 16, b, 6, sse42::_SIDD_CMP_EQUAL_ORDERED); assert_eq!(1, i); } @@ -782,19 +759,20 @@ mod tests { unsafe fn _mm_cmpestrc() { let va = str_to_m128i(b"!!!!!!!!"); let vb = str_to_m128i(b" "); - let i = sse42::_mm_cmpestrc( - va, 7, vb, 7, sse42::_SIDD_UNIT_MASK); + let i = sse42::_mm_cmpestrc(va, 7, vb, 7, sse42::_SIDD_UNIT_MASK); assert_eq!(0, i); } #[simd_test = "sse4.2"] unsafe fn _mm_cmpestrs() { - let a_bytes = u8x16::new(0x00, 0x48, 0x00, 0x65, 0x00, 0x6c, 0x00, 0x6c, - 0x00, 0x6f, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00); + #[cfg_attr(rustfmt, rustfmt_skip)] + let a_bytes = u8x16::new( + 0x00, 0x48, 0x00, 0x65, 0x00, 0x6c, 0x00, 0x6c, + 0x00, 0x6f, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + ); let a = __m128i::from(a_bytes); let b = __m128i::from(u8x16::splat(0x00)); - let i = sse42::_mm_cmpestrs( - a, 8, b, 0, sse42::_SIDD_UWORD_OPS); + let i = sse42::_mm_cmpestrs(a, 8, b, 0, sse42::_SIDD_UWORD_OPS); assert_eq!(0, i); } @@ -802,8 +780,7 @@ mod tests { unsafe fn _mm_cmpestro() { let a = str_to_m128i(b"Hello"); let b = str_to_m128i(b"World"); - let i = sse42::_mm_cmpestro( - a, 5, b, 5, sse42::_SIDD_UBYTE_OPS); + let i = sse42::_mm_cmpestro(a, 5, b, 5, sse42::_SIDD_UBYTE_OPS); assert_eq!(0, i); } @@ -812,7 +789,12 @@ mod tests { let a = str_to_m128i(b"Cannot match a"); let b = str_to_m128i(b"Null after 14"); let i = sse42::_mm_cmpestra( - a, 14, b, 16, sse42::_SIDD_CMP_EQUAL_EACH | sse42::_SIDD_UNIT_MASK); + a, + 14, + b, + 16, + sse42::_SIDD_CMP_EQUAL_EACH | sse42::_SIDD_UNIT_MASK, + ); assert_eq!(1, i); } diff --git a/library/stdarch/src/x86/ssse3.rs b/library/stdarch/src/x86/ssse3.rs index 111bc8ff99fd..b5c9d3ae9ef6 100644 --- a/library/stdarch/src/x86/ssse3.rs +++ b/library/stdarch/src/x86/ssse3.rs @@ -13,7 +13,8 @@ pub unsafe fn _mm_abs_epi8(a: i8x16) -> u8x16 { pabsb128(a) } -/// Compute the absolute value of each of the packed 16-bit signed integers in `a` and +/// Compute the absolute value of each of the packed 16-bit signed integers in +/// `a` and /// return the 16-bit unsigned integer #[inline(always)] #[target_feature = "+ssse3"] @@ -22,7 +23,8 @@ pub unsafe fn _mm_abs_epi16(a: i16x8) -> u16x8 { pabsw128(a) } -/// Compute the absolute value of each of the packed 32-bit signed integers in `a` and +/// Compute the absolute value of each of the packed 32-bit signed integers in +/// `a` and /// return the 32-bit unsigned integer #[inline(always)] #[target_feature = "+ssse3"] @@ -82,7 +84,9 @@ pub unsafe fn _mm_alignr_epi8(a: i8x16, b: i8x16, n: i32) -> i8x16 { (a, b, n) }; - const fn add(a: u32, b: u32) -> u32 { a + b } + const fn add(a: u32, b: u32) -> u32 { + a + b + } macro_rules! shuffle { ($shift:expr) => { simd_shuffle16(b, a, [ @@ -98,14 +102,22 @@ pub unsafe fn _mm_alignr_epi8(a: i8x16, b: i8x16, n: i32) -> i8x16 { } } match n { - 0 => shuffle!(0), 1 => shuffle!(1), - 2 => shuffle!(2), 3 => shuffle!(3), - 4 => shuffle!(4), 5 => shuffle!(5), - 6 => shuffle!(6), 7 => shuffle!(7), - 8 => shuffle!(8), 9 => shuffle!(9), - 10 => shuffle!(10), 11 => shuffle!(11), - 12 => shuffle!(12), 13 => shuffle!(13), - 14 => shuffle!(14), 15 => shuffle!(15), + 0 => shuffle!(0), + 1 => shuffle!(1), + 2 => shuffle!(2), + 3 => shuffle!(3), + 4 => shuffle!(4), + 5 => shuffle!(5), + 6 => shuffle!(6), + 7 => shuffle!(7), + 8 => shuffle!(8), + 9 => shuffle!(9), + 10 => shuffle!(10), + 11 => shuffle!(11), + 12 => shuffle!(12), + 13 => shuffle!(13), + 14 => shuffle!(14), + 15 => shuffle!(15), _ => shuffle!(16), } } @@ -223,7 +235,7 @@ pub unsafe fn _mm_sign_epi32(a: i32x4, b: i32x4) -> i32x4 { } #[allow(improper_ctypes)] -extern { +extern "C" { #[link_name = "llvm.x86.ssse3.pabs.b.128"] fn pabsb128(a: i8x16) -> u8x16; @@ -275,7 +287,7 @@ mod tests { use stdsimd_test::simd_test; use v128::*; - use x86::ssse3 as ssse3; + use x86::ssse3; #[simd_test = "ssse3"] unsafe fn _mm_abs_epi8() { @@ -297,44 +309,36 @@ mod tests { #[simd_test = "ssse3"] unsafe fn _mm_shuffle_epi8() { - let a = u8x16::new( - 1, 2, 3, 4, - 5, 6, 7, 8, - 9, 10, 11, 12, - 13, 14, 15, 16, - ); - let b = u8x16::new( - 4, 128, 4, 3, - 24, 12, 6, 19, - 12, 5, 5, 10, - 4, 1, 8, 0, - ); - let expected = u8x16::new( - 5, 0, 5, 4, - 9, 13, 7, 4, - 13, 6, 6, 11, - 5, 2, 9, 1, - ); + let a = + u8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); + let b = + u8x16::new(4, 128, 4, 3, 24, 12, 6, 19, 12, 5, 5, 10, 4, 1, 8, 0); + let expected = + u8x16::new(5, 0, 5, 4, 9, 13, 7, 4, 13, 6, 6, 11, 5, 2, 9, 1); let r = ssse3::_mm_shuffle_epi8(a, b); assert_eq!(r, expected); } #[simd_test = "ssse3"] unsafe fn _mm_alignr_epi8() { - let a = i8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); - let b = i8x16::new(4, 63, 4, 3, 24, 12, 6, 19, 12, 5, 5, 10, 4, 1, 8, 0); + let a = + i8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); + let b = + i8x16::new(4, 63, 4, 3, 24, 12, 6, 19, 12, 5, 5, 10, 4, 1, 8, 0); let r = ssse3::_mm_alignr_epi8(a, b, 33); assert_eq!(r, i8x16::splat(0)); let r = ssse3::_mm_alignr_epi8(a, b, 17); - let expected = i8x16::new(2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 0); + let expected = + i8x16::new(2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 0); assert_eq!(r, expected); let r = ssse3::_mm_alignr_epi8(a, b, 16); assert_eq!(r, a); let r = ssse3::_mm_alignr_epi8(a, b, 15); - let expected = i8x16::new(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let expected = + i8x16::new(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); assert_eq!(r, expected); let r = ssse3::_mm_alignr_epi8(a, b, 0); @@ -397,8 +401,10 @@ mod tests { #[simd_test = "ssse3"] unsafe fn _mm_maddubs_epi16() { - let a = u8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); - let b = i8x16::new(4, 63, 4, 3, 24, 12, 6, 19, 12, 5, 5, 10, 4, 1, 8, 0); + let a = + u8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); + let b = + i8x16::new(4, 63, 4, 3, 24, 12, 6, 19, 12, 5, 5, 10, 4, 1, 8, 0); let expected = i16x8::new(130, 24, 192, 194, 158, 175, 66, 120); let r = ssse3::_mm_maddubs_epi16(a, b); assert_eq!(r, expected); @@ -415,9 +421,21 @@ mod tests { #[simd_test = "ssse3"] unsafe fn _mm_sign_epi8() { - let a = i8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, -14, -15, 16); - let b = i8x16::new(4, 63, -4, 3, 24, 12, -6, -19, 12, 5, -5, 10, 4, 1, -8, 0); - let expected = i8x16::new(1, 2, -3, 4, 5, 6, -7, -8, 9, 10, -11, 12, 13, -14, 15, 0); + #[cfg_attr(rustfmt, rustfmt_skip)] + let a = i8x16::new( + 1, 2, 3, 4, 5, 6, 7, 8, + 9, 10, 11, 12, 13, -14, -15, 16, + ); + #[cfg_attr(rustfmt, rustfmt_skip)] + let b = i8x16::new( + 4, 63, -4, 3, 24, 12, -6, -19, + 12, 5, -5, 10, 4, 1, -8, 0, + ); + #[cfg_attr(rustfmt, rustfmt_skip)] + let expected = i8x16::new( + 1, 2, -3, 4, 5, 6, -7, -8, + 9, 10, -11, 12, 13, -14, 15, 0, + ); let r = ssse3::_mm_sign_epi8(a, b); assert_eq!(r, expected); } diff --git a/library/stdarch/src/x86/tbm.rs b/library/stdarch/src/x86/tbm.rs index d38fc3fa10f4..c6be38adc8d8 100644 --- a/library/stdarch/src/x86/tbm.rs +++ b/library/stdarch/src/x86/tbm.rs @@ -1,16 +1,21 @@ //! Trailing Bit Manipulation (TBM) instruction set. //! //! The reference is [AMD64 Architecture Programmer's Manual, Volume 3: -//! General-Purpose and System -//! Instructions](http://support.amd.com/TechDocs/24594.pdf). +//! General-Purpose and System Instructions][amd64_ref]. //! -//! [Wikipedia](https://en.wikipedia.org/wiki/Bit_Manipulation_Instruction_Sets#TBM_.28Trailing_Bit_Manipulation.29) -//! provides a quick overview of the available instructions. +//! [Wikipedia][wikipedia_bmi] provides a quick overview of the available +//! instructions. +//! +//! [amd64_ref]: http://support.amd.com/TechDocs/24594.pdf +//! [wikipedia_bmi]: +//! https://en.wikipedia.org/wiki/Bit_Manipulation_Instruction_Sets#ABM_. +//! 28Advanced_Bit_Manipulation.29 #[cfg(test)] use stdsimd_test::assert_instr; -// TODO: LLVM-CODEGEN ERROR: LLVM ERROR: Cannot select: intrinsic %llvm.x86.tbm.bextri.u32 +// TODO: LLVM-CODEGEN ERROR: LLVM ERROR: Cannot select: +// intrinsic %llvm.x86.tbm.bextri.u32 /* #[allow(dead_code)] extern "C" { @@ -39,8 +44,8 @@ pub fn _bextr_u64(a: u64, start: u64, len: u64) -> u64 { /// Extracts bits of `a` specified by `control` into /// the least significant bits of the result. /// -/// Bits [7,0] of `control` specify the index to the first bit in the range to be -/// extracted, and bits [15,8] specify the length of the range. +/// Bits [7,0] of `control` specify the index to the first bit in the range to +/// be extracted, and bits [15,8] specify the length of the range. #[inline(always)] #[target_feature = "+tbm"] pub fn _bextr2_u32(a: u32, control: u32) -> u32 { @@ -50,8 +55,8 @@ pub fn _bextr2_u32(a: u32, control: u32) -> u32 { /// Extracts bits of `a` specified by `control` into /// the least significant bits of the result. /// -/// Bits [7,0] of `control` specify the index to the first bit in the range to be -/// extracted, and bits [15,8] specify the length of the range. +/// Bits [7,0] of `control` specify the index to the first bit in the range to +/// be extracted, and bits [15,8] specify the length of the range. #[inline(always)] #[target_feature = "+tbm"] pub fn _bextr2_u64(a: u64, control: u64) -> u64 { @@ -122,7 +127,8 @@ pub unsafe fn _blcic_u64(x: u64) -> u64 { !x & (x.wrapping_add(1)) } -/// Sets the least significant zero bit of `x` and clears all bits above that bit. +/// Sets the least significant zero bit of `x` and clears all bits above +/// that bit. /// /// If there is no zero bit in `x`, it sets all the bits. #[inline(always)] @@ -132,7 +138,8 @@ pub unsafe fn _blcmsk_u32(x: u32) -> u32 { x ^ (x.wrapping_add(1)) } -/// Sets the least significant zero bit of `x` and clears all bits above that bit. +/// Sets the least significant zero bit of `x` and clears all bits above +/// that bit. /// /// If there is no zero bit in `x`, it sets all the bits. #[inline(always)] @@ -272,162 +279,152 @@ mod tests { #[simd_test = "tbm"] unsafe fn _blcfill_u32() { - assert_eq!( - tbm::_blcfill_u32(0b0101_0111u32), - 0b0101_0000u32); - assert_eq!( - tbm::_blcfill_u32(0b1111_1111u32), - 0u32); + assert_eq!(tbm::_blcfill_u32(0b0101_0111u32), 0b0101_0000u32); + assert_eq!(tbm::_blcfill_u32(0b1111_1111u32), 0u32); } #[simd_test = "tbm"] #[cfg(not(target_arch = "x86"))] unsafe fn _blcfill_u64() { - assert_eq!( - tbm::_blcfill_u64(0b0101_0111u64), - 0b0101_0000u64); - assert_eq!( - tbm::_blcfill_u64(0b1111_1111u64), - 0u64); + assert_eq!(tbm::_blcfill_u64(0b0101_0111u64), 0b0101_0000u64); + assert_eq!(tbm::_blcfill_u64(0b1111_1111u64), 0u64); } #[simd_test = "tbm"] unsafe fn _blci_u32() { assert_eq!( tbm::_blci_u32(0b0101_0000u32), - 0b1111_1111_1111_1111_1111_1111_1111_1110u32); + 0b1111_1111_1111_1111_1111_1111_1111_1110u32 + ); assert_eq!( tbm::_blci_u32(0b1111_1111u32), - 0b1111_1111_1111_1111_1111_1110_1111_1111u32); + 0b1111_1111_1111_1111_1111_1110_1111_1111u32 + ); } #[simd_test = "tbm"] #[cfg(not(target_arch = "x86"))] + #[cfg_attr(rustfmt, rustfmt_skip)] unsafe fn _blci_u64() { assert_eq!( tbm::_blci_u64(0b0101_0000u64), - 0b1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1110u64); + 0b1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1110u64 + ); assert_eq!( tbm::_blci_u64(0b1111_1111u64), - 0b1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1110_1111_1111u64); + 0b1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1110_1111_1111u64 + ); } #[simd_test = "tbm"] unsafe fn _blcic_u32() { - assert_eq!( - tbm::_blcic_u32(0b0101_0001u32), - 0b0000_0010u32); - assert_eq!( - tbm::_blcic_u32(0b1111_1111u32), - 0b1_0000_0000u32); + assert_eq!(tbm::_blcic_u32(0b0101_0001u32), 0b0000_0010u32); + assert_eq!(tbm::_blcic_u32(0b1111_1111u32), 0b1_0000_0000u32); } #[simd_test = "tbm"] #[cfg(not(target_arch = "x86"))] unsafe fn _blcic_u64() { - assert_eq!( - tbm::_blcic_u64(0b0101_0001u64), - 0b0000_0010u64); - assert_eq!( - tbm::_blcic_u64(0b1111_1111u64), - 0b1_0000_0000u64); + assert_eq!(tbm::_blcic_u64(0b0101_0001u64), 0b0000_0010u64); + assert_eq!(tbm::_blcic_u64(0b1111_1111u64), 0b1_0000_0000u64); } #[simd_test = "tbm"] unsafe fn _blcmsk_u32() { - assert_eq!( - tbm::_blcmsk_u32(0b0101_0001u32), - 0b0000_0011u32); - assert_eq!( - tbm::_blcmsk_u32(0b1111_1111u32), - 0b1_1111_1111u32); + assert_eq!(tbm::_blcmsk_u32(0b0101_0001u32), 0b0000_0011u32); + assert_eq!(tbm::_blcmsk_u32(0b1111_1111u32), 0b1_1111_1111u32); } #[simd_test = "tbm"] #[cfg(not(target_arch = "x86"))] unsafe fn _blcmsk_u64() { - assert_eq!( - tbm::_blcmsk_u64(0b0101_0001u64), - 0b0000_0011u64); - assert_eq!( - tbm::_blcmsk_u64(0b1111_1111u64), - 0b1_1111_1111u64); + assert_eq!(tbm::_blcmsk_u64(0b0101_0001u64), 0b0000_0011u64); + assert_eq!(tbm::_blcmsk_u64(0b1111_1111u64), 0b1_1111_1111u64); } #[simd_test = "tbm"] unsafe fn _blcs_u32() { - assert_eq!(tbm::_blcs_u32(0b0101_0001u32), 0b0101_0011u32); - assert_eq!(tbm::_blcs_u32(0b1111_1111u32), 0b1_1111_1111u32); + assert_eq!(tbm::_blcs_u32(0b0101_0001u32), 0b0101_0011u32); + assert_eq!(tbm::_blcs_u32(0b1111_1111u32), 0b1_1111_1111u32); } #[simd_test = "tbm"] #[cfg(not(target_arch = "x86"))] unsafe fn _blcs_u64() { - assert_eq!(tbm::_blcs_u64(0b0101_0001u64), 0b0101_0011u64); - assert_eq!(tbm::_blcs_u64(0b1111_1111u64), 0b1_1111_1111u64); + assert_eq!(tbm::_blcs_u64(0b0101_0001u64), 0b0101_0011u64); + assert_eq!(tbm::_blcs_u64(0b1111_1111u64), 0b1_1111_1111u64); } #[simd_test = "tbm"] unsafe fn _blsfill_u32() { - assert_eq!( - tbm::_blsfill_u32(0b0101_0100u32), - 0b0101_0111u32); + assert_eq!(tbm::_blsfill_u32(0b0101_0100u32), 0b0101_0111u32); assert_eq!( tbm::_blsfill_u32(0u32), - 0b1111_1111_1111_1111_1111_1111_1111_1111u32); + 0b1111_1111_1111_1111_1111_1111_1111_1111u32 + ); } #[simd_test = "tbm"] #[cfg(not(target_arch = "x86"))] + #[cfg_attr(rustfmt, rustfmt_skip)] unsafe fn _blsfill_u64() { - assert_eq!( - tbm::_blsfill_u64(0b0101_0100u64), - 0b0101_0111u64); + assert_eq!(tbm::_blsfill_u64(0b0101_0100u64), 0b0101_0111u64); assert_eq!( tbm::_blsfill_u64(0u64), - 0b1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111u64); + 0b1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111u64 + ); } #[simd_test = "tbm"] unsafe fn _blsic_u32() { assert_eq!( tbm::_blsic_u32(0b0101_0100u32), - 0b1111_1111_1111_1111_1111_1111_1111_1011u32); + 0b1111_1111_1111_1111_1111_1111_1111_1011u32 + ); assert_eq!( tbm::_blsic_u32(0u32), - 0b1111_1111_1111_1111_1111_1111_1111_1111u32); + 0b1111_1111_1111_1111_1111_1111_1111_1111u32 + ); } #[simd_test = "tbm"] #[cfg(not(target_arch = "x86"))] + #[cfg_attr(rustfmt, rustfmt_skip)] unsafe fn _blsic_u64() { assert_eq!( tbm::_blsic_u64(0b0101_0100u64), - 0b1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1011u64); - assert_eq!( - tbm::_blsic_u64(0u64), - 0b1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111u64); + 0b1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1011u64 + ); + assert_eq!( + tbm::_blsic_u64(0u64), + 0b1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111u64 + ); } #[simd_test = "tbm"] unsafe fn _t1mskc_u32() { - assert_eq!( - tbm::_t1mskc_u32(0b0101_0111u32), - 0b1111_1111_1111_1111_1111_1111_1111_1000u32); - assert_eq!( - tbm::_t1mskc_u32(0u32), - 0b1111_1111_1111_1111_1111_1111_1111_1111u32); + assert_eq!( + tbm::_t1mskc_u32(0b0101_0111u32), + 0b1111_1111_1111_1111_1111_1111_1111_1000u32 + ); + assert_eq!( + tbm::_t1mskc_u32(0u32), + 0b1111_1111_1111_1111_1111_1111_1111_1111u32 + ); } #[simd_test = "tbm"] #[cfg(not(target_arch = "x86"))] + #[cfg_attr(rustfmt, rustfmt_skip)] unsafe fn _t1mksc_u64() { - assert_eq!( - tbm::_t1mskc_u64(0b0101_0111u64), - 0b1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1000u64); - assert_eq!( - tbm::_t1mskc_u64(0u64), - 0b1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111u64); + assert_eq!( + tbm::_t1mskc_u64(0b0101_0111u64), + 0b1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1000u64 + ); + assert_eq!( + tbm::_t1mskc_u64(0u64), + 0b1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111u64 + ); } #[simd_test = "tbm"] diff --git a/library/stdarch/stdsimd-test/assert-instr-macro/build.rs b/library/stdarch/stdsimd-test/assert-instr-macro/build.rs index dc42e265b737..45a868441c4a 100644 --- a/library/stdarch/stdsimd-test/assert-instr-macro/build.rs +++ b/library/stdarch/stdsimd-test/assert-instr-macro/build.rs @@ -2,7 +2,10 @@ use std::env; fn main() { println!("cargo:rerun-if-changed=build.rs"); - let opt_level = env::var("OPT_LEVEL").ok().and_then(|s| s.parse().ok()).unwrap_or(0); + let opt_level = env::var("OPT_LEVEL") + .ok() + .and_then(|s| s.parse().ok()) + .unwrap_or(0); let profile = env::var("PROFILE").unwrap_or(String::new()); if profile == "release" || opt_level >= 2 { println!("cargo:rustc-cfg=optimized"); diff --git a/library/stdarch/stdsimd-test/assert-instr-macro/src/lib.rs b/library/stdarch/stdsimd-test/assert-instr-macro/src/lib.rs index 22f5dcef24ea..9e5df683e9a3 100644 --- a/library/stdarch/stdsimd-test/assert-instr-macro/src/lib.rs +++ b/library/stdarch/stdsimd-test/assert-instr-macro/src/lib.rs @@ -21,13 +21,13 @@ extern crate synom; use proc_macro2::TokenStream; #[proc_macro_attribute] -pub fn assert_instr(attr: proc_macro::TokenStream, - item: proc_macro::TokenStream) - -> proc_macro::TokenStream -{ +pub fn assert_instr( + attr: proc_macro::TokenStream, item: proc_macro::TokenStream +) -> proc_macro::TokenStream { let invoc = syn::parse::(attr) .expect("expected #[assert_instr(instr, a = b, ...)]"); - let item = syn::parse::(item).expect("must be attached to an item"); + let item = + syn::parse::(item).expect("must be attached to an item"); let func = match item.node { syn::ItemKind::Fn(ref f) => f, _ => panic!("must be attached to a function"), @@ -40,10 +40,11 @@ pub fn assert_instr(attr: proc_macro::TokenStream, (quote! { #[ignore] }).into() }; let name = &func.ident; - let assert_name = syn::Ident::from(&format!("assert_{}_{}", - name.sym.as_str(), - instr.sym.as_str())[..]); - let shim_name = syn::Ident::from(&format!("{}_shim", name.sym.as_str())[..]); + let assert_name = syn::Ident::from( + &format!("assert_{}_{}", name.sym.as_str(), instr.sym.as_str())[..], + ); + let shim_name = + syn::Ident::from(&format!("{}_shim", name.sym.as_str())[..]); let (to_test, test_name) = if invoc.args.len() == 0 { (TokenStream::empty(), &func.ident) } else { @@ -69,16 +70,29 @@ pub fn assert_instr(attr: proc_macro::TokenStream, } }; } - let attrs = item.attrs.iter().filter(|attr| { - attr.path.segments.get(0).item().ident.sym.as_str().starts_with("target") - }).collect::>(); + let attrs = item.attrs + .iter() + .filter(|attr| { + attr.path + .segments + .get(0) + .item() + .ident + .sym + .as_str() + .starts_with("target") + }) + .collect::>(); let attrs = Append(&attrs); - (quote! { - #attrs - unsafe fn #shim_name(#(#inputs),*) #ret { - #name(#(#input_vals),*) - } - }.into(), &shim_name) + ( + quote! { + #attrs + unsafe fn #shim_name(#(#inputs),*) #ret { + #name(#(#input_vals),*) + } + }.into(), + &shim_name, + ) }; let tts: TokenStream = quote! { @@ -128,8 +142,9 @@ impl synom::Synom for Invoc { struct Append(T); impl quote::ToTokens for Append - where T: Clone + IntoIterator, - T::Item: quote::ToTokens +where + T: Clone + IntoIterator, + T::Item: quote::ToTokens, { fn to_tokens(&self, tokens: &mut quote::Tokens) { for item in self.0.clone() { diff --git a/library/stdarch/stdsimd-test/simd-test-macro/src/lib.rs b/library/stdarch/stdsimd-test/simd-test-macro/src/lib.rs index 1543a2767710..3777feae798e 100644 --- a/library/stdarch/stdsimd-test/simd-test-macro/src/lib.rs +++ b/library/stdarch/stdsimd-test/simd-test-macro/src/lib.rs @@ -1,16 +1,16 @@ //! Implementation of the `#[simd_test]` macro //! -//! This macro expands to a `#[test]` function which tests the local machine for -//! the appropriate cfg before calling the inner test function. +//! This macro expands to a `#[test]` function which tests the local machine +//! for the appropriate cfg before calling the inner test function. #![feature(proc_macro)] +extern crate proc_macro2; +extern crate proc_macro; #[macro_use] extern crate quote; -extern crate proc_macro; -extern crate proc_macro2; -use proc_macro2::{TokenStream, Term, TokenNode, TokenTree}; +use proc_macro2::{Term, TokenNode, TokenStream, TokenTree}; use proc_macro2::Literal; fn string(s: &str) -> TokenTree { @@ -22,8 +22,9 @@ fn string(s: &str) -> TokenTree { } #[proc_macro_attribute] -pub fn simd_test(attr: proc_macro::TokenStream, - item: proc_macro::TokenStream) -> proc_macro::TokenStream { +pub fn simd_test( + attr: proc_macro::TokenStream, item: proc_macro::TokenStream +) -> proc_macro::TokenStream { let tokens = TokenStream::from(attr).into_iter().collect::>(); if tokens.len() != 2 { panic!("expected #[simd_test = \"feature\"]"); @@ -37,8 +38,9 @@ pub fn simd_test(attr: proc_macro::TokenStream, TokenNode::Literal(ref l) => l.to_string(), _ => panic!("expected #[simd_test = \"feature\"]"), }; - let enable_feature = enable_feature.trim_left_matches('"') - .trim_right_matches('"'); + let enable_feature = enable_feature + .trim_left_matches('"') + .trim_right_matches('"'); let enable_feature = string(&format!("+{}", enable_feature)); let item = TokenStream::from(item); let name = find_name(item.clone()); @@ -67,7 +69,7 @@ fn find_name(item: TokenStream) -> Term { while let Some(tok) = tokens.next() { if let TokenNode::Term(word) = tok.kind { if word.as_str() == "fn" { - break + break; } } } diff --git a/library/stdarch/stdsimd-test/src/lib.rs b/library/stdarch/stdsimd-test/src/lib.rs index 8bad780acb14..5de401695f61 100644 --- a/library/stdarch/stdsimd-test/src/lib.rs +++ b/library/stdarch/stdsimd-test/src/lib.rs @@ -7,12 +7,12 @@ #![feature(proc_macro)] extern crate assert_instr_macro; -extern crate simd_test_macro; extern crate backtrace; extern crate cc; -extern crate rustc_demangle; #[macro_use] extern crate lazy_static; +extern crate rustc_demangle; +extern crate simd_test_macro; use std::collections::HashMap; use std::env; @@ -23,7 +23,8 @@ pub use assert_instr_macro::*; pub use simd_test_macro::*; lazy_static! { - static ref DISASSEMBLY: HashMap> = disassemble_myself(); + static ref DISASSEMBLY: HashMap> + = disassemble_myself(); } struct Function { @@ -37,14 +38,22 @@ struct Instruction { fn disassemble_myself() -> HashMap> { let me = env::current_exe().expect("failed to get current exe"); - if cfg!(target_arch = "x86_64") && - cfg!(target_os = "windows") && - cfg!(target_env = "msvc") { - let mut cmd = cc::windows_registry::find("x86_64-pc-windows-msvc", "dumpbin.exe") - .expect("failed to find `dumpbin` tool"); - let output = cmd.arg("/DISASM").arg(&me).output() + if cfg!(target_arch = "x86_64") && cfg!(target_os = "windows") + && cfg!(target_env = "msvc") + { + let mut cmd = cc::windows_registry::find( + "x86_64-pc-windows-msvc", + "dumpbin.exe", + ).expect("failed to find `dumpbin` tool"); + let output = cmd.arg("/DISASM") + .arg(&me) + .output() .expect("failed to execute dumpbin"); - println!("{}\n{}", output.status, String::from_utf8_lossy(&output.stderr)); + println!( + "{}\n{}", + output.status, + String::from_utf8_lossy(&output.stderr) + ); assert!(output.status.success()); parse_dumpbin(&String::from_utf8_lossy(&output.stdout)) } else if cfg!(target_os = "windows") { @@ -55,7 +64,11 @@ fn disassemble_myself() -> HashMap> { .arg(&me) .output() .expect("failed to execute otool"); - println!("{}\n{}", output.status, String::from_utf8_lossy(&output.stderr)); + println!( + "{}\n{}", + output.status, + String::from_utf8_lossy(&output.stderr) + ); assert!(output.status.success()); parse_otool(&str::from_utf8(&output.stdout).expect("stdout not utf8")) @@ -66,10 +79,16 @@ fn disassemble_myself() -> HashMap> { .arg(&me) .output() .expect("failed to execute objdump"); - println!("{}\n{}", output.status, String::from_utf8_lossy(&output.stderr)); + println!( + "{}\n{}", + output.status, + String::from_utf8_lossy(&output.stderr) + ); assert!(output.status.success()); - parse_objdump(&str::from_utf8(&output.stdout).expect("stdout not utf8")) + parse_objdump( + &str::from_utf8(&output.stdout).expect("stdout not utf8"), + ) } } @@ -91,7 +110,7 @@ fn parse_objdump(output: &str) -> HashMap> { while let Some(header) = lines.next() { // symbols should start with `$hex_addr <$name>:` if !header.ends_with(">:") { - continue + continue; } let start = header.find("<").unwrap(); let symbol = &header[start + 1..header.len() - 2]; @@ -99,15 +118,17 @@ fn parse_objdump(output: &str) -> HashMap> { let mut instructions = Vec::new(); while let Some(instruction) = lines.next() { if instruction.is_empty() { - break + break; } // Each line of instructions should look like: // // $rel_offset: ab cd ef 00 $instruction... - let parts = instruction.split_whitespace() + let parts = instruction + .split_whitespace() .skip(1) .skip_while(|s| { - s.len() == expected_len && usize::from_str_radix(s, 16).is_ok() + s.len() == expected_len + && usize::from_str_radix(s, 16).is_ok() }) .map(|s| s.to_string()) .collect::>(); @@ -116,10 +137,12 @@ fn parse_objdump(output: &str) -> HashMap> { ret.entry(normalize(symbol)) .or_insert(Vec::new()) - .push(Function { instrs: instructions }); + .push(Function { + instrs: instructions, + }); } - return ret + return ret; } fn parse_otool(output: &str) -> HashMap> { @@ -138,7 +161,7 @@ fn parse_otool(output: &str) -> HashMap> { }; // symbols should start with `$symbol:` if !header.ends_with(":") { - continue + continue; } // strip the leading underscore and the trailing colon let symbol = &header[1..header.len() - 1]; @@ -147,12 +170,13 @@ fn parse_otool(output: &str) -> HashMap> { while let Some(instruction) = lines.next() { if instruction.ends_with(":") { cached_header = Some(instruction); - break + break; } // Each line of instructions should look like: // // $addr $instruction... - let parts = instruction.split_whitespace() + let parts = instruction + .split_whitespace() .skip(1) .map(|s| s.to_string()) .collect::>(); @@ -161,10 +185,12 @@ fn parse_otool(output: &str) -> HashMap> { ret.entry(normalize(symbol)) .or_insert(Vec::new()) - .push(Function { instrs: instructions }); + .push(Function { + instrs: instructions, + }); } - return ret + return ret; } fn parse_dumpbin(output: &str) -> HashMap> { @@ -183,7 +209,7 @@ fn parse_dumpbin(output: &str) -> HashMap> { }; // symbols should start with `$symbol:` if !header.ends_with(":") { - continue + continue; } // strip the trailing colon let symbol = &header[..header.len() - 1]; @@ -192,20 +218,21 @@ fn parse_dumpbin(output: &str) -> HashMap> { while let Some(instruction) = lines.next() { if !instruction.starts_with(" ") { cached_header = Some(instruction); - break + break; } // Each line looks like: // // > $addr: ab cd ef $instr.. // > 00 12 # this line os optional if instruction.starts_with(" ") { - continue + continue; } - let parts = instruction.split_whitespace() + let parts = instruction + .split_whitespace() .skip(1) - .skip_while(|s| { - s.len() == 2 && usize::from_str_radix(s, 16).is_ok() - }) + .skip_while( + |s| s.len() == 2 && usize::from_str_radix(s, 16).is_ok(), + ) .map(|s| s.to_string()) .collect::>(); instructions.push(Instruction { parts }); @@ -213,10 +240,12 @@ fn parse_dumpbin(output: &str) -> HashMap> { ret.entry(normalize(symbol)) .or_insert(Vec::new()) - .push(Function { instrs: instructions }); + .push(Function { + instrs: instructions, + }); } - return ret + return ret; } fn normalize(symbol: &str) -> String { @@ -266,7 +295,7 @@ pub fn assert(fnptr: usize, fnname: &str, expected: &str) { // instruction: tzcntl => tzcnt and compares that. if part.starts_with(expected) { found = true; - break + break; } } } @@ -274,7 +303,7 @@ pub fn assert(fnptr: usize, fnname: &str, expected: &str) { let probably_only_one_instruction = function.instrs.len() < 30; if found && probably_only_one_instruction { - return + return; } // Help debug by printing out the found disassembly, and then panic as we diff --git a/library/stdarch/tests/cpu-detection.rs b/library/stdarch/tests/cpu-detection.rs index 1a49a8762b33..36cf7c97aa14 100644 --- a/library/stdarch/tests/cpu-detection.rs +++ b/library/stdarch/tests/cpu-detection.rs @@ -1,10 +1,10 @@ #![cfg_attr(feature = "strict", deny(warnings))] #![feature(cfg_target_feature)] +extern crate cupid; #[macro_use] #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] extern crate stdsimd; -extern crate cupid; #[test] #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]