diff --git a/library/stdarch/.appveyor.yml b/library/stdarch/.appveyor.yml new file mode 100644 index 000000000000..bd02240ad106 --- /dev/null +++ b/library/stdarch/.appveyor.yml @@ -0,0 +1,24 @@ +environment: + # We don't want to do identical comdat folding as it messes up the ability to + # generate lossless backtraces in some cases. This is enabled by rustc by + # default so pass a flag to disable it to ensure our tests work ok. + RUSTFLAGS: -Clink-args=/OPT:NOICF + + matrix: + - TARGET: x86_64-pc-windows-msvc + +install: + # Install rust, x86_64-pc-windows-msvc host + - appveyor-retry appveyor DownloadFile https://win.rustup.rs/ -FileName rustup-init.exe + - rustup-init.exe -y --default-host x86_64-pc-windows-msvc --default-toolchain nightly + - set PATH=%PATH%;C:\Users\appveyor\.cargo\bin + - if NOT "%TARGET%" == "x86_64-pc-windows-msvc" rustup target add %TARGET% + - rustc -vV + - cargo -vV + +build: false + +test_script: + - cargo test --target %TARGET% + - set RUST_BACKTRACE=1 + - cargo test --target %TARGET% --release diff --git a/library/stdarch/.travis.yml b/library/stdarch/.travis.yml new file mode 100644 index 000000000000..12638698cb44 --- /dev/null +++ b/library/stdarch/.travis.yml @@ -0,0 +1,16 @@ +language: rust +sudo: false + +matrix: + include: + - rust: nightly + - rust: nightly + os: osx + +script: + - cargo test + - cargo test --release + +notifications: + email: + on_success: never diff --git a/library/stdarch/.vscode/temp.sql b/library/stdarch/.vscode/temp.sql new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/library/stdarch/CONTRIBUTING.md b/library/stdarch/CONTRIBUTING.md index a99eaa5eedf5..8de3f8466080 100644 --- a/library/stdarch/CONTRIBUTING.md +++ b/library/stdarch/CONTRIBUTING.md @@ -14,6 +14,7 @@ example for `_mm_adds_epi16`: /// Add packed 16-bit integers in `a` and `b` using saturation. 
#[inline(always)] #[target_feature = "+sse2"] +#[cfg_attr(test, assert_instr(paddsw))] pub fn _mm_adds_epi16(a: i16x8, b: i16x8) -> i16x8 { unsafe { paddsw(a, b) } } @@ -32,6 +33,10 @@ Let's break this down: support `sse2`, the compiler will still generate code for `_mm_adds_epi16` *as if* `sse2` support existed. Without this attribute, the compiler might not generate the intended CPU instruction. +* The `#[cfg_attr(test, assert_instr(paddsw))]` attribute indicates that when + we're testing the crate we'll assert that the `paddsw` instruction is + generated inside this function, ensuring that the SIMD intrinsic truly is an + intrinsic for the instruction! * The types of the vectors given to the intrinsic should generally match the types as provided in the vendor interface. We'll talk about this more below. * The implementation of the vendor intrinsic is generally very simple. @@ -40,7 +45,7 @@ Let's break this down: compiler intrinsic (in this case, `paddsw`) when one is available. More on this below as well. -Once a function has been added, you should add at least one test for basic +Once a function has been added, you should also add at least one test for basic functionality. 
Here's an example for `_mm_adds_epi16`: ```rust diff --git a/library/stdarch/Cargo.toml b/library/stdarch/Cargo.toml index 9a6e7c95d704..87cd5dd14ca4 100644 --- a/library/stdarch/Cargo.toml +++ b/library/stdarch/Cargo.toml @@ -13,3 +13,10 @@ license = "MIT" [profile.release] debug = true opt-level = 3 + +[profile.bench] +debug = true +opt-level = 3 + +[dev-dependencies] +assert-instr = { path = "assert-instr" } diff --git a/library/stdarch/TODO.md b/library/stdarch/TODO.md index 12832620bf33..6b69e250da84 100644 --- a/library/stdarch/TODO.md +++ b/library/stdarch/TODO.md @@ -155,7 +155,7 @@ sse * [ ] `_mm_storer_ps` * [ ] `_mm_move_ss` * [ ] `_mm_shuffle_ps` -* [ ] `_mm_unpackhi_ps` +* [x] `_mm_unpackhi_ps` * [ ] `_mm_unpacklo_ps` * [ ] `_mm_movehl_ps` * [ ] `_mm_movelh_ps` diff --git a/library/stdarch/assert-instr/Cargo.toml b/library/stdarch/assert-instr/Cargo.toml new file mode 100644 index 000000000000..fda3e32c7682 --- /dev/null +++ b/library/stdarch/assert-instr/Cargo.toml @@ -0,0 +1,11 @@ +[package] +name = "assert-instr" +version = "0.1.0" +authors = ["Alex Crichton "] + +[dependencies] +assert-instr-macro = { path = "assert-instr-macro" } +backtrace = "0.3" +cc = "1.0" +lazy_static = "0.2" +rustc-demangle = "0.1" diff --git a/library/stdarch/assert-instr/assert-instr-macro/Cargo.toml b/library/stdarch/assert-instr/assert-instr-macro/Cargo.toml new file mode 100644 index 000000000000..367f4b5e94ba --- /dev/null +++ b/library/stdarch/assert-instr/assert-instr-macro/Cargo.toml @@ -0,0 +1,7 @@ +[package] +name = "assert-instr-macro" +version = "0.1.0" +authors = ["Alex Crichton "] + +[lib] +proc-macro = true diff --git a/library/stdarch/assert-instr/assert-instr-macro/build.rs b/library/stdarch/assert-instr/assert-instr-macro/build.rs new file mode 100644 index 000000000000..dc42e265b737 --- /dev/null +++ b/library/stdarch/assert-instr/assert-instr-macro/build.rs @@ -0,0 +1,10 @@ +use std::env; + +fn main() { + println!("cargo:rerun-if-changed=build.rs"); + 
let opt_level = env::var("OPT_LEVEL").ok().and_then(|s| s.parse().ok()).unwrap_or(0); + let profile = env::var("PROFILE").unwrap_or(String::new()); + if profile == "release" || opt_level >= 2 { + println!("cargo:rustc-cfg=optimized"); + } +} diff --git a/library/stdarch/assert-instr/assert-instr-macro/src/lib.rs b/library/stdarch/assert-instr/assert-instr-macro/src/lib.rs new file mode 100644 index 000000000000..9d7093a52322 --- /dev/null +++ b/library/stdarch/assert-instr/assert-instr-macro/src/lib.rs @@ -0,0 +1,71 @@ +//! Implementation of the `#[assert_instr]` macro +//! +//! This macro is used when testing the `stdsimd` crate and is used to generate +//! test cases to assert that functions do indeed contain the instructions that +//! we're expecting them to contain. +//! +//! The procedural macro here is relatively simple, it simply appends a +//! `#[test]` function to the original token stream which asserts that the +//! function itself contains the relevant instruction. + +#![feature(proc_macro)] + +extern crate proc_macro; + +use proc_macro::{TokenStream, Term, TokenNode, Delimiter}; + +#[proc_macro_attribute] +pub fn assert_instr(attr: TokenStream, item: TokenStream) -> TokenStream { + let name = find_name(item.clone()); + let tokens = attr.into_iter().collect::>(); + if tokens.len() != 1 { + panic!("expected #[assert_instr(foo)]"); + } + let tokens = match tokens[0].kind { + TokenNode::Group(Delimiter::Parenthesis, ref rest) => rest.clone(), + _ => panic!("expected #[assert_instr(foo)]"), + }; + let tokens = tokens.into_iter().collect::>(); + if tokens.len() != 1 { + panic!("expected #[assert_instr(foo)]"); + } + let instr = match tokens[0].kind { + TokenNode::Term(term) => term, + _ => panic!("expected #[assert_instr(foo)]"), + }; + + let ignore = if cfg!(optimized) { + "" + } else { + "#[ignore]" + }; + let test = format!(" + #[test] + #[allow(non_snake_case)] + {ignore} + fn assert_instr_{name}() {{ + ::assert_instr::assert({name} as usize, + 
\"{name}\", + \"{instr}\"); + }} + ", name = name.as_str(), instr = instr.as_str(), ignore = ignore); + let test: TokenStream = test.parse().unwrap(); + + item.into_iter().chain(test.into_iter()).collect() +} + +fn find_name(item: TokenStream) -> Term { + let mut tokens = item.into_iter(); + while let Some(tok) = tokens.next() { + if let TokenNode::Term(word) = tok.kind { + if word.as_str() == "fn" { + break + } + } + } + + match tokens.next().map(|t| t.kind) { + Some(TokenNode::Term(word)) => word, + _ => panic!("failed to find function name"), + } +} diff --git a/library/stdarch/assert-instr/src/lib.rs b/library/stdarch/assert-instr/src/lib.rs new file mode 100644 index 000000000000..ada7b8bc3fa0 --- /dev/null +++ b/library/stdarch/assert-instr/src/lib.rs @@ -0,0 +1,273 @@ +//! Runtime support needed for the `#![assert_instr]` macro +//! +//! This basically just disassembles the current executable and then parses the +//! output once globally and then provides the `assert` function which makes +//! assertions about the disassembly of a function. + +#![feature(proc_macro)] + +extern crate assert_instr_macro; +extern crate backtrace; +extern crate cc; +extern crate rustc_demangle; +#[macro_use] +extern crate lazy_static; + +use std::collections::HashMap; +use std::env; +use std::process::Command; +use std::str; + +pub use assert_instr_macro::*; + +lazy_static! 
{ + static ref DISASSEMBLY: HashMap> = disassemble_myself(); +} + +struct Function { + instrs: Vec, +} + +struct Instruction { + parts: Vec, +} + +fn disassemble_myself() -> HashMap> { + let me = env::current_exe().expect("failed to get current exe"); + + if cfg!(target_arch = "x86_64") && + cfg!(target_os = "windows") && + cfg!(target_env = "msvc") { + let mut cmd = cc::windows_registry::find("x86_64-pc-windows-msvc", "dumpbin.exe") + .expect("failed to find `dumpbin` tool"); + let output = cmd.arg("/DISASM").arg(&me).output() + .expect("failed to execute dumpbin"); + println!("{}\n{}", output.status, String::from_utf8_lossy(&output.stderr)); + assert!(output.status.success()); + parse_dumpbin(&String::from_utf8_lossy(&output.stdout)) + } else if cfg!(target_os = "windows") { + panic!("disassembly unimplemented") + } else if cfg!(target_os = "macos") { + let output = Command::new("otool") + .arg("-vt") + .arg(&me) + .output() + .expect("failed to execute otool"); + println!("{}\n{}", output.status, String::from_utf8_lossy(&output.stderr)); + assert!(output.status.success()); + + parse_otool(&str::from_utf8(&output.stdout).expect("stdout not utf8")) + } else { + let output = Command::new("objdump") + .arg("--disassemble") + .arg(&me) + .output() + .expect("failed to execute objdump"); + println!("{}\n{}", output.status, String::from_utf8_lossy(&output.stderr)); + assert!(output.status.success()); + + parse_objdump(&str::from_utf8(&output.stdout).expect("stdout not utf8")) + } +} + +fn parse_objdump(output: &str) -> HashMap> { + let mut lines = output.lines(); + + for line in output.lines().take(100) { + println!("{}", line); + } + + let mut ret = HashMap::new(); + while let Some(header) = lines.next() { + // symbols should start with `$hex_addr <$name>:` + if !header.ends_with(">:") { + continue + } + let start = header.find("<").unwrap(); + let symbol = &header[start + 1..header.len() - 2]; + + let mut instructions = Vec::new(); + while let Some(instruction) = 
lines.next() { + if instruction.is_empty() { + break + } + // Each line of instructions should look like: + // + // $rel_offset: ab cd ef 00 $instruction... + let parts = instruction.split_whitespace() + .skip(1) + .skip_while(|s| { + s.len() == 2 && usize::from_str_radix(s, 16).is_ok() + }) + .map(|s| s.to_string()) + .collect::>(); + instructions.push(Instruction { parts }); + } + + ret.entry(normalize(symbol)) + .or_insert(Vec::new()) + .push(Function { instrs: instructions }); + } + + return ret +} + +fn parse_otool(output: &str) -> HashMap> { + let mut lines = output.lines(); + + for line in output.lines().take(100) { + println!("{}", line); + } + + let mut ret = HashMap::new(); + let mut cached_header = None; + loop { + let header = match cached_header.take().or_else(|| lines.next()) { + Some(header) => header, + None => break, + }; + // symbols should start with `$symbol:` + if !header.ends_with(":") { + continue + } + // strip the leading underscore and the trailing colon + let symbol = &header[1..header.len() - 1]; + + let mut instructions = Vec::new(); + while let Some(instruction) = lines.next() { + if instruction.ends_with(":") { + cached_header = Some(instruction); + break + } + // Each line of instructions should look like: + // + // $addr $instruction... 
+ let parts = instruction.split_whitespace() + .skip(1) + .map(|s| s.to_string()) + .collect::<Vec<String>>(); + instructions.push(Instruction { parts }); + } + + ret.entry(normalize(symbol)) + .or_insert(Vec::new()) + .push(Function { instrs: instructions }); + } + + return ret +} + +fn parse_dumpbin(output: &str) -> HashMap<String, Vec<Function>> { + let mut lines = output.lines(); + + for line in output.lines().take(100) { + println!("{}", line); + } + + let mut ret = HashMap::new(); + let mut cached_header = None; + loop { + let header = match cached_header.take().or_else(|| lines.next()) { + Some(header) => header, + None => break, + }; + // symbols should start with `$symbol:` + if !header.ends_with(":") { + continue + } + // strip the trailing colon + let symbol = &header[..header.len() - 1]; + + let mut instructions = Vec::new(); + while let Some(instruction) = lines.next() { + if !instruction.starts_with(" ") { + cached_header = Some(instruction); + break + } + // Each line looks like: + // + // > $addr: ab cd ef $instr.. + // > 00 12 # this line is optional + if instruction.starts_with(" ") { + continue + } + let parts = instruction.split_whitespace() + .skip(1) + .skip_while(|s| { + s.len() == 2 && usize::from_str_radix(s, 16).is_ok() + }) + .map(|s| s.to_string()) + .collect::<Vec<String>>(); + instructions.push(Instruction { parts }); + } + + ret.entry(normalize(symbol)) + .or_insert(Vec::new()) + .push(Function { instrs: instructions }); + } + + return ret +} + +fn normalize(symbol: &str) -> String { + let symbol = rustc_demangle::demangle(symbol).to_string(); + match symbol.rfind("::h") { + Some(i) => symbol[..i].to_string(), + None => symbol.to_string(), + } +} + +/// Main entry point for this crate, called by the `#[assert_instr]` macro. +/// +/// This asserts that the function at `fnptr` contains the instruction +/// `expected` provided.
+pub fn assert(fnptr: usize, fnname: &str, expected: &str) { + // Translate this function pointer to a symbolic name that we'd have found + // in the disassembly. + let mut sym = None; + backtrace::resolve(fnptr as *mut _, |name| { + sym = name.name().and_then(|s| s.as_str()).map(normalize); + }); + + let functions = match sym.as_ref().and_then(|s| DISASSEMBLY.get(s)) { + Some(s) => s, + None => { + if let Some(sym) = sym { + println!("assumed symbol name: `{}`", sym); + } + println!("maybe related functions"); + for f in DISASSEMBLY.keys().filter(|k| k.contains(fnname)) { + println!("\t- {}", f); + } + panic!("failed to find disassembly of {:#x} ({})", fnptr, fnname); + } + }; + + assert_eq!(functions.len(), 1); + let function = &functions[0]; + + // Look for `expected` as the first part of any instruction in this + // function, returning if we do indeed find it. + for instr in function.instrs.iter() { + // Gets the first instruction, e.g. tzcntl in tzcntl %rax,%rax + if let Some(part) = instr.parts.get(0) { + // Truncates the instruction with the length of the expected + // instruction: tzcntl => tzcnt and compares that. + if part.starts_with(expected) { + return + } + } + } + + // Help debug by printing out the found disassembly, and then panic as we + // didn't find the instruction. + println!("disassembly for {}: ", sym.as_ref().unwrap()); + for (i, instr) in function.instrs.iter().enumerate() { + print!("\t{:2}: ", i); + for part in instr.parts.iter() { + print!("{} ", part); + } + println!(""); + } + panic!("failed to find instruction `{}` in the disassembly", expected); +} diff --git a/library/stdarch/src/arm/mod.rs b/library/stdarch/src/arm/mod.rs new file mode 100644 index 000000000000..9472441ae4fe --- /dev/null +++ b/library/stdarch/src/arm/mod.rs @@ -0,0 +1,10 @@ +//! ARM intrinsics. 
+pub use self::v6::*; +pub use self::v7::*; +#[cfg(target_arch = "aarch64")] +pub use self::v8::*; + +mod v6; +mod v7; +#[cfg(target_arch = "aarch64")] +mod v8; diff --git a/library/stdarch/src/arm/v6.rs b/library/stdarch/src/arm/v6.rs new file mode 100644 index 000000000000..95442b374f8c --- /dev/null +++ b/library/stdarch/src/arm/v6.rs @@ -0,0 +1,25 @@ +//! ARMv6 intrinsics. +//! +//! The reference is [ARMv6-M Architecture Reference +//! Manual](http://infocenter.arm.com/help/index.jsp?topic=/com.arm.doc.ddi0419c/index.html). + +/// Reverse the order of the bytes. +#[inline(always)] +#[cfg_attr(test, assert_instr(rev))] +pub fn _rev_u8(x: u8) -> u8 { + x.swap_bytes() as u8 +} + +/// Reverse the order of the bytes. +#[inline(always)] +#[cfg_attr(test, assert_instr(rev))] +pub fn _rev_u16(x: u16) -> u16 { + x.swap_bytes() as u16 +} + +/// Reverse the order of the bytes. +#[inline(always)] +#[cfg_attr(test, assert_instr(rev))] +pub fn _rev_u32(x: u32) -> u32 { + x.swap_bytes() as u32 +} diff --git a/library/stdarch/src/arm/v7.rs b/library/stdarch/src/arm/v7.rs new file mode 100644 index 000000000000..1052b8477a92 --- /dev/null +++ b/library/stdarch/src/arm/v7.rs @@ -0,0 +1,40 @@ +//! ARMv7 intrinsics. +//! +//! The reference is [ARMv7-M Architecture Reference Manual (Issue +//! E.b)](http://infocenter.arm.com/help/index.jsp?topic=/com.arm.doc.ddi0403e.b/index.html). + +pub use super::v6::*; + +/// Count Leading Zeros. +#[inline(always)] +#[cfg_attr(test, assert_instr(clz))] +pub fn _clz_u8(x: u8) -> u8 { + x.leading_zeros() as u8 +} + +/// Count Leading Zeros. +#[inline(always)] +#[cfg_attr(test, assert_instr(clz))] +pub fn _clz_u16(x: u16) -> u16 { + x.leading_zeros() as u16 +} + +/// Count Leading Zeros. +#[inline(always)] +#[cfg_attr(test, assert_instr(clz))] +pub fn _clz_u32(x: u32) -> u32 { + x.leading_zeros() as u32 +} + +#[allow(dead_code)] +extern "C" { + #[link_name="llvm.bitreverse.i32"] + fn rbit_u32(i: i32) -> i32; +} + +/// Reverse the bit order. 
+#[inline(always)] +#[cfg_attr(test, assert_instr(rbit))] +pub fn _rbit_u32(x: u32) -> u32 { + unsafe { rbit_u32(x as i32) as u32 } +} diff --git a/library/stdarch/src/arm/v8.rs b/library/stdarch/src/arm/v8.rs new file mode 100644 index 000000000000..e49ca4fe1f25 --- /dev/null +++ b/library/stdarch/src/arm/v8.rs @@ -0,0 +1,54 @@ +//! ARMv8 intrinsics. +//! +//! The reference is [ARMv8-A Reference Manual](http://infocenter.arm.com/help/index.jsp?topic=/com.arm.doc.ddi0487a.k_10775/index.html). + +pub use super::v7::*; + +/// Reverse the order of the bytes. +#[inline(always)] +#[cfg_attr(test, assert_instr(rev))] +pub fn _rev_u64(x: u64) -> u64 { + x.swap_bytes() as u64 +} + +/// Count Leading Zeros. +#[inline(always)] +#[cfg_attr(test, assert_instr(clz))] +pub fn _clz_u64(x: u64) -> u64 { + x.leading_zeros() as u64 +} + +#[allow(dead_code)] +extern "C" { + #[link_name="llvm.bitreverse.i64"] + fn rbit_u64(i: i64) -> i64; +} + +/// Reverse the bit order. +#[inline(always)] +#[cfg_attr(test, assert_instr(rbit))] +pub fn _rbit_u64(x: u64) -> u64 { + unsafe { rbit_u64(x as i64) as u64 } +} + +/// Counts the leading most significant bits set. +/// +/// When all bits of the operand are set it returns the size of the operand in +/// bits. +#[inline(always)] +// LLVM Bug (should be cls): https://bugs.llvm.org/show_bug.cgi?id=31802 +#[cfg_attr(test, assert_instr(clz))] +pub fn _cls_u32(x: u32) -> u32 { + u32::leading_zeros(!x) as u32 +} + +/// Counts the leading most significant bits set. +/// +/// When all bits of the operand are set it returns the size of the operand in +/// bits. 
+#[inline(always)] +// LLVM Bug (should be cls): https://bugs.llvm.org/show_bug.cgi?id=31802 +#[cfg_attr(test, assert_instr(clz))] +pub fn _cls_u64(x: u64) -> u64 { + u64::leading_zeros(!x) as u64 +} diff --git a/library/stdarch/src/lib.rs b/library/stdarch/src/lib.rs index 1aa713742844..2e75c3e833d0 100644 --- a/library/stdarch/src/lib.rs +++ b/library/stdarch/src/lib.rs @@ -1,8 +1,12 @@ #![allow(dead_code)] #![feature( const_fn, link_llvm_intrinsics, platform_intrinsics, repr_simd, simd_ffi, - target_feature, + target_feature, cfg_target_feature, i128_type )] +#![cfg_attr(test, feature(proc_macro))] + +#[cfg(test)] +extern crate assert_instr; /// Platform independent SIMD vector types and operations. pub mod simd { @@ -16,6 +20,9 @@ pub mod simd { pub mod vendor { #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] pub use x86::*; + + #[cfg(any(target_arch = "arm", target_arch = "aarch64"))] + pub use arm::*; } #[macro_use] @@ -27,3 +34,6 @@ mod v512; mod v64; #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] mod x86; + +#[cfg(any(target_arch = "arm", target_arch = "aarch64"))] + mod arm; diff --git a/library/stdarch/src/x86/abm.rs b/library/stdarch/src/x86/abm.rs new file mode 100644 index 000000000000..19f50de2190f --- /dev/null +++ b/library/stdarch/src/x86/abm.rs @@ -0,0 +1,71 @@ +//! Advanced Bit Manipulation (ABM) instructions +//! +//! The POPCNT and LZCNT have their own CPUID bits to indicate support. +//! +//! The references are: +//! +//! - [Intel 64 and IA-32 Architectures Software Developer's Manual Volume 2: Instruction Set Reference, A-Z](http://www.intel.de/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-instruction-set-reference-manual-325383.pdf). +//! - [AMD64 Architecture Programmer's Manual, Volume 3: General-Purpose and System Instructions](http://support.amd.com/TechDocs/24594.pdf). +//! +//! 
[Wikipedia](https://en.wikipedia.org/wiki/Bit_Manipulation_Instruction_Sets#ABM_.28Advanced_Bit_Manipulation.29) +//! provides a quick overview of the instructions available. + +#[cfg(test)] +use assert_instr::assert_instr; + +/// Counts the leading most significant zero bits. +/// +/// When the operand is zero, it returns its size in bits. +#[inline(always)] +#[target_feature = "+lzcnt"] +#[cfg_attr(test, assert_instr(lzcnt))] +pub fn _lzcnt_u32(x: u32) -> u32 { x.leading_zeros() } + +/// Counts the leading most significant zero bits. +/// +/// When the operand is zero, it returns its size in bits. +#[inline(always)] +#[target_feature = "+lzcnt"] +#[cfg_attr(test, assert_instr(lzcnt))] +pub fn _lzcnt_u64(x: u64) -> u64 { x.leading_zeros() as u64 } + +/// Counts the bits that are set. +#[inline(always)] +#[target_feature = "+popcnt"] +#[cfg_attr(test, assert_instr(popcnt))] +pub fn _popcnt32(x: u32) -> u32 { x.count_ones() } + +/// Counts the bits that are set. +#[inline(always)] +#[target_feature = "+popcnt"] +#[cfg_attr(test, assert_instr(popcnt))] +pub fn _popcnt64(x: u64) -> u64 { x.count_ones() as u64 } + +#[cfg(all(test, target_feature = "bmi", any(target_arch = "x86", target_arch = "x86_64")))] +mod tests { + use x86::abm; + + #[test] + #[target_feature = "+lzcnt"] + fn _lzcnt_u32() { + assert_eq!(abm::_lzcnt_u32(0b0101_1010u32), 25u32); + } + + #[test] + #[target_feature = "+lzcnt"] + fn _lzcnt_u64() { + assert_eq!(abm::_lzcnt_u64(0b0101_1010u64), 57u64); + } + + #[test] + #[target_feature = "+popcnt"] + fn _popcnt32() { + assert_eq!(abm::_popcnt32(0b0101_1010u32), 4); + } + + #[test] + #[target_feature = "+popcnt"] + fn _popcnt64() { + assert_eq!(abm::_popcnt64(0b0101_1010u64), 4); + } +} diff --git a/library/stdarch/src/x86/avx.rs b/library/stdarch/src/x86/avx.rs index 6ec764c3776b..7b23d1e6cde5 100644 --- a/library/stdarch/src/x86/avx.rs +++ b/library/stdarch/src/x86/avx.rs @@ -31,7 +31,7 @@ extern "C" { } -#[cfg(test)] +#[cfg(all(test, target_feature = 
"avx", any(target_arch = "x86", target_arch = "x86_64")))] mod tests { use v256::*; use x86::avx; @@ -65,7 +65,4 @@ mod tests { let e = f64x4::new(-4.0,8.0,-4.0,12.0); assert_eq!(r, e); } - - - -} \ No newline at end of file +} diff --git a/library/stdarch/src/x86/avx2.rs b/library/stdarch/src/x86/avx2.rs index e07f26a67dec..ac81ccb9dea1 100644 --- a/library/stdarch/src/x86/avx2.rs +++ b/library/stdarch/src/x86/avx2.rs @@ -1044,7 +1044,7 @@ extern "C" { } -#[cfg(test)] +#[cfg(all(test, target_feature = "avx2", any(target_arch = "x86", target_arch = "x86_64")))] mod tests { use v256::*; use v128::*; diff --git a/library/stdarch/src/x86/bmi.rs b/library/stdarch/src/x86/bmi.rs new file mode 100644 index 000000000000..ae5dbf223801 --- /dev/null +++ b/library/stdarch/src/x86/bmi.rs @@ -0,0 +1,288 @@ +//! Bit Manipulation Instruction (BMI) Set 1.0. +//! +//! The reference is [Intel 64 and IA-32 Architectures Software Developer's +//! Manual Volume 2: Instruction Set Reference, +//! A-Z](http://www.intel.de/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-instruction-set-reference-manual-325383.pdf). +//! +//! [Wikipedia](https://en.wikipedia.org/wiki/Bit_Manipulation_Instruction_Sets#BMI1_.28Bit_Manipulation_Instruction_Set_1.29) +//! provides a quick overview of the available instructions. + +#[cfg(test)] +use assert_instr::assert_instr; + +#[allow(dead_code)] +extern "C" { + #[link_name="llvm.x86.bmi.bextr.32"] + fn x86_bmi_bextr_32(x: u32, y: u32) -> u32; + #[link_name="llvm.x86.bmi.bextr.64"] + fn x86_bmi_bextr_64(x: u64, y: u64) -> u64; +} + +/// Extracts bits in range [`start`, `start` + `length`) from `a` into +/// the least significant bits of the result. 
+#[inline(always)] +#[target_feature = "+bmi"] +#[cfg_attr(test, assert_instr(bextr))] +pub fn _bextr_u32(a: u32, start: u32, len: u32) -> u32 { + _bextr2_u32(a, (start & 0xffu32) | ((len & 0xffu32) << 8u32)) +} + +/// Extracts bits in range [`start`, `start` + `length`) from `a` into +/// the least significant bits of the result. +#[inline(always)] +#[target_feature = "+bmi"] +#[cfg_attr(test, assert_instr(bextr))] +pub fn _bextr_u64(a: u64, start: u64, len: u64) -> u64 { + _bextr2_u64(a, (start & 0xffu64) | ((len & 0xffu64) << 8u64)) +} + +/// Extracts bits of `a` specified by `control` into +/// the least significant bits of the result. +/// +/// Bits [7,0] of `control` specify the index to the first bit in the range to be +/// extracted, and bits [15,8] specify the length of the range. +#[inline(always)] +#[target_feature = "+bmi"] +#[cfg_attr(test, assert_instr(bextr))] +pub fn _bextr2_u32(a: u32, control: u32) -> u32 { + unsafe { x86_bmi_bextr_32(a, control) } +} + +/// Extracts bits of `a` specified by `control` into +/// the least significant bits of the result. +/// +/// Bits [7,0] of `control` specify the index to the first bit in the range to be +/// extracted, and bits [15,8] specify the length of the range. +#[inline(always)] +#[target_feature = "+bmi"] +#[cfg_attr(test, assert_instr(bextr))] +pub fn _bextr2_u64(a: u64, control: u64) -> u64 { + unsafe { x86_bmi_bextr_64(a, control) } +} + +/// Bitwise logical `AND` of inverted `a` with `b`. +#[inline(always)] +#[target_feature = "+bmi"] +#[cfg_attr(test, assert_instr(andn))] +pub fn _andn_u32(a: u32, b: u32) -> u32 { + !a & b +} + +/// Bitwise logical `AND` of inverted `a` with `b`. +#[inline(always)] +#[target_feature = "+bmi"] +#[cfg_attr(test, assert_instr(andn))] +pub fn _andn_u64(a: u64, b: u64) -> u64 { + !a & b +} + +/// Extract lowest set isolated bit. 
+#[inline(always)] +#[target_feature = "+bmi"] +#[cfg_attr(test, assert_instr(blsi))] +pub fn _blsi_u32(x: u32) -> u32 { + x & x.wrapping_neg() +} + +/// Extract lowest set isolated bit. +#[inline(always)] +#[target_feature = "+bmi"] +#[cfg_attr(test, assert_instr(blsi))] +pub fn _blsi_u64(x: u64) -> u64 { + x & x.wrapping_neg() +} + +/// Get mask up to lowest set bit. +#[inline(always)] +#[target_feature = "+bmi"] +#[cfg_attr(test, assert_instr(blsmsk))] +pub fn _blsmsk_u32(x: u32) -> u32 { + x ^ (x.wrapping_sub(1u32)) +} + +/// Get mask up to lowest set bit. +#[inline(always)] +#[target_feature = "+bmi"] +#[cfg_attr(test, assert_instr(blsmsk))] +pub fn _blsmsk_u64(x: u64) -> u64 { + x ^ (x.wrapping_sub(1u64)) +} + +/// Resets the lowest set bit of `x`. +/// +/// If `x` is 0, sets CF. +#[inline(always)] +#[target_feature = "+bmi"] +#[cfg_attr(test, assert_instr(blsr))] +pub fn _blsr_u32(x: u32) -> u32 { + x & (x.wrapping_sub(1)) +} + +/// Resets the lowest set bit of `x`. +/// +/// If `x` is 0, sets CF. +#[inline(always)] +#[target_feature = "+bmi"] +#[cfg_attr(test, assert_instr(blsr))] +pub fn _blsr_u64(x: u64) -> u64 { + x & (x.wrapping_sub(1)) +} + +/// Counts the number of trailing least significant zero bits. +/// +/// When the source operand is 0, it returns its size in bits. +#[inline(always)] +#[target_feature = "+bmi"] +#[cfg_attr(test, assert_instr(tzcnt))] +pub fn _tzcnt_u16(x: u16) -> u16 { + x.trailing_zeros() as u16 +} + +/// Counts the number of trailing least significant zero bits. +/// +/// When the source operand is 0, it returns its size in bits. +#[inline(always)] +#[target_feature = "+bmi"] +#[cfg_attr(test, assert_instr(tzcnt))] +pub fn _tzcnt_u32(x: u32) -> u32 { + x.trailing_zeros() +} + +/// Counts the number of trailing least significant zero bits. +/// +/// When the source operand is 0, it returns its size in bits.
+#[inline(always)] +#[target_feature = "+bmi"] +#[cfg_attr(test, assert_instr(tzcnt))] +pub fn _tzcnt_u64(x: u64) -> u64 { + x.trailing_zeros() as u64 +} + +/// Counts the number of trailing least significant zero bits. +/// +/// When the source operand is 0, it returns its size in bits. +#[inline(always)] +#[target_feature = "+bmi"] +#[cfg_attr(test, assert_instr(tzcnt))] +pub fn _mm_tzcnt_u32(x: u32) -> u32 { + x.trailing_zeros() +} + +/// Counts the number of trailing least significant zero bits. +/// +/// When the source operand is 0, it returns its size in bits. +#[inline(always)] +#[target_feature = "+bmi"] +#[cfg_attr(test, assert_instr(tzcnt))] +pub fn _mm_tzcnt_u64(x: u64) -> u64 { + x.trailing_zeros() as u64 +} + +#[cfg(all(test, target_feature = "bmi", any(target_arch = "x86", target_arch = "x86_64")))] +mod tests { + use x86::bmi; + + #[test] + #[target_feature = "+bmi"] + fn _bextr_u32() { + assert_eq!(bmi::_bextr_u32(0b0101_0000u32, 4, 4), 0b0000_0101u32); + } + + #[test] + #[target_feature = "+bmi"] + fn _bextr_u64() { + assert_eq!(bmi::_bextr_u64(0b0101_0000u64, 4, 4), 0b0000_0101u64); + } + + #[test] + #[target_feature = "+bmi"] + fn _andn_u32() { + assert_eq!(bmi::_andn_u32(0, 0), 0); + assert_eq!(bmi::_andn_u32(0, 1), 1); + assert_eq!(bmi::_andn_u32(1, 0), 0); + assert_eq!(bmi::_andn_u32(1, 1), 0); + + assert_eq!(bmi::_andn_u32(0b0000_0000u32, 0b0000_0000u32), 0b0000_0000u32); + assert_eq!(bmi::_andn_u32(0b0000_0000u32, 0b1111_1111u32), 0b1111_1111u32); + assert_eq!(bmi::_andn_u32(0b1111_1111u32, 0b0000_0000u32), 0b0000_0000u32); + assert_eq!(bmi::_andn_u32(0b1111_1111u32, 0b1111_1111u32), 0b0000_0000u32); + assert_eq!(bmi::_andn_u32(0b0100_0000u32, 0b0101_1101u32), 0b0001_1101u32); + } + + #[test] + #[target_feature = "+bmi"] + fn _andn_u64() { + assert_eq!(bmi::_andn_u64(0, 0), 0); + assert_eq!(bmi::_andn_u64(0, 1), 1); + assert_eq!(bmi::_andn_u64(1, 0), 0); + assert_eq!(bmi::_andn_u64(1, 1), 0); + + assert_eq!(bmi::_andn_u64(0b0000_0000u64, 
0b0000_0000u64), 0b0000_0000u64); + assert_eq!(bmi::_andn_u64(0b0000_0000u64, 0b1111_1111u64), 0b1111_1111u64); + assert_eq!(bmi::_andn_u64(0b1111_1111u64, 0b0000_0000u64), 0b0000_0000u64); + assert_eq!(bmi::_andn_u64(0b1111_1111u64, 0b1111_1111u64), 0b0000_0000u64); + assert_eq!(bmi::_andn_u64(0b0100_0000u64, 0b0101_1101u64), 0b0001_1101u64); + } + + #[test] + #[target_feature = "+bmi"] + fn _blsi_u32() { + assert_eq!(bmi::_blsi_u32(0b1101_0000u32), 0b0001_0000u32); + } + + #[test] + #[target_feature = "+bmi"] + fn _blsi_u64() { + assert_eq!(bmi::_blsi_u64(0b1101_0000u64), 0b0001_0000u64); + } + + #[test] + #[target_feature = "+bmi"] + fn _blsmsk_u32() { + assert_eq!(bmi::_blsmsk_u32(0b0011_0000u32), 0b0001_1111u32); + } + + #[test] + #[target_feature = "+bmi"] + fn _blsmsk_u64() { + assert_eq!(bmi::_blsmsk_u64(0b0011_0000u64), 0b0001_1111u64); + } + + #[test] + #[target_feature = "+bmi"] + fn _blsr_u32() { + /// TODO: test the behavior when the input is 0 + assert_eq!(bmi::_blsr_u32(0b0011_0000u32), 0b0010_0000u32); + } + + #[test] + #[target_feature = "+bmi"] + fn _blsr_u64() { + /// TODO: test the behavior when the input is 0 + assert_eq!(bmi::_blsr_u64(0b0011_0000u64), 0b0010_0000u64); + } + + #[test] + #[target_feature = "+bmi"] + fn _tzcnt_u16() { + assert_eq!(bmi::_tzcnt_u16(0b0000_0001u16), 0u16); + assert_eq!(bmi::_tzcnt_u16(0b0000_0000u16), 16u16); + assert_eq!(bmi::_tzcnt_u16(0b1001_0000u16), 4u16); + } + + #[test] + #[target_feature = "+bmi"] + fn _tzcnt_u32() { + assert_eq!(bmi::_tzcnt_u32(0b0000_0001u32), 0u32); + assert_eq!(bmi::_tzcnt_u32(0b0000_0000u32), 32u32); + assert_eq!(bmi::_tzcnt_u32(0b1001_0000u32), 4u32); + } + + #[test] + #[target_feature = "+bmi"] + fn _tzcnt_u64() { + assert_eq!(bmi::_tzcnt_u64(0b0000_0001u64), 0u64); + assert_eq!(bmi::_tzcnt_u64(0b0000_0000u64), 64u64); + assert_eq!(bmi::_tzcnt_u64(0b1001_0000u64), 4u64); + } +} diff --git a/library/stdarch/src/x86/bmi2.rs b/library/stdarch/src/x86/bmi2.rs new file mode 100644 index 
000000000000..67f8740399e4 --- /dev/null +++ b/library/stdarch/src/x86/bmi2.rs @@ -0,0 +1,215 @@ +//! Bit Manipulation Instruction (BMI) Set 2.0. +//! +//! The reference is [Intel 64 and IA-32 Architectures Software Developer's +//! Manual Volume 2: Instruction Set Reference, +//! A-Z](http://www.intel.de/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectu res-software-developer-instruction-set-reference-manual-325383.pdf). +//! +//! [Wikipedia](https://en.wikipedia.org/wiki/Bit_Manipulation_Instruction_Sets#BMI2_.28Bit_Manipulation_Instruction_Set_2.29) +//! provides a quick overview of the available instructions. + +#[cfg(test)] +use assert_instr::assert_instr; + +/// Unsigned multiply without affecting flags. +/// +/// Unsigned multiplication of `a` with `b` returning a pair `(lo, hi)` with +/// the low half and the high half of the result. +#[inline(always)] +// LLVM BUG (should be mulxl): https://bugs.llvm.org/show_bug.cgi?id=34232 +#[cfg_attr(test, assert_instr(imul))] +#[target_feature = "+bmi2"] +pub fn _mulx_u32(a: u32, b: u32) -> (u32, u32) { + let result: u64 = (a as u64) * (b as u64); + let hi = (result >> 32) as u32; + (result as u32, hi) +} + +/// Unsigned multiply without affecting flags. +/// +/// Unsigned multiplication of `a` with `b` returning a pair `(lo, hi)` with +/// the low half and the high half of the result. 
+#[inline(always)] +#[cfg_attr(test, assert_instr(mulx))] +#[target_feature = "+bmi2"] +pub fn _mulx_u64(a: u64, b: u64) -> (u64, u64) { + let result: u128 = (a as u128) * (b as u128); + let hi = (result >> 64) as u64; + (result as u64, hi) +} + +#[allow(dead_code)] +extern "C" { + #[link_name="llvm.x86.bmi.bzhi.32"] + fn x86_bmi2_bzhi_32(x: u32, y: u32) -> u32; + #[link_name="llvm.x86.bmi.bzhi.64"] + fn x86_bmi2_bzhi_64(x: u64, y: u64) -> u64; + #[link_name="llvm.x86.bmi.pdep.32"] + fn x86_bmi2_pdep_32(x: u32, y: u32) -> u32; + #[link_name="llvm.x86.bmi.pdep.64"] + fn x86_bmi2_pdep_64(x: u64, y: u64) -> u64; + #[link_name="llvm.x86.bmi.pext.32"] + fn x86_bmi2_pext_32(x: u32, y: u32) -> u32; + #[link_name="llvm.x86.bmi.pext.64"] + fn x86_bmi2_pext_64(x: u64, y: u64) -> u64; +} + + +/// Zero higher bits of `a` >= `index`. +#[inline(always)] +#[target_feature = "+bmi2"] +#[cfg_attr(test, assert_instr(bzhi))] +pub fn _bzhi_u32(a: u32, index: u32) -> u32 { + unsafe { x86_bmi2_bzhi_32(a, index) } +} + +/// Zero higher bits of `a` >= `index`. +#[inline(always)] +#[target_feature = "+bmi2"] +#[cfg_attr(test, assert_instr(bzhi))] +pub fn _bzhi_u64(a: u64, index: u64) -> u64 { + unsafe { x86_bmi2_bzhi_64(a, index) } +} + + +/// Scatter contiguous low order bits of `a` to the result at the positions +/// specified by the `mask`. +#[inline(always)] +#[target_feature = "+bmi2"] +#[cfg_attr(test, assert_instr(pdep))] +pub fn _pdep_u32(a: u32, mask: u32) -> u32 { + unsafe { x86_bmi2_pdep_32(a, mask) } +} + +/// Scatter contiguous low order bits of `a` to the result at the positions +/// specified by the `mask`. +#[inline(always)] +#[target_feature = "+bmi2"] +#[cfg_attr(test, assert_instr(pdep))] +pub fn _pdep_u64(a: u64, mask: u64) -> u64 { + unsafe { x86_bmi2_pdep_64(a, mask) } +} + +/// Gathers the bits of `x` specified by the `mask` into the contiguous low +/// order bit positions of the result. 
+#[inline(always)] +#[target_feature = "+bmi2"] +#[cfg_attr(test, assert_instr(pext))] +pub fn _pext_u32(a: u32, mask: u32) -> u32 { + unsafe { x86_bmi2_pext_32(a, mask) } +} + +/// Gathers the bits of `x` specified by the `mask` into the contiguous low +/// order bit positions of the result. +#[inline(always)] +#[target_feature = "+bmi2"] +#[cfg_attr(test, assert_instr(pext))] +pub fn _pext_u64(a: u64, mask: u64) -> u64 { + unsafe { x86_bmi2_pext_64(a, mask) } +} + +#[cfg(all(test, target_feature = "bmi2", any(target_arch = "x86", target_arch = "x86_64")))] +mod tests { + use x86::bmi2; + + #[test] + #[target_feature = "+bmi2"] + fn _pext_u32() { + let n = 0b1011_1110_1001_0011u32; + + let m0 = 0b0110_0011_1000_0101u32; + let s0 = 0b0000_0000_0011_0101u32; + + let m1 = 0b1110_1011_1110_1111u32; + let s1 = 0b0001_0111_0100_0011u32; + + assert_eq!(bmi2::_pext_u32(n, m0), s0); + assert_eq!(bmi2::_pext_u32(n, m1), s1); + } + + #[test] + #[target_feature = "+bmi2"] + fn _pext_u64() { + let n = 0b1011_1110_1001_0011u64; + + let m0 = 0b0110_0011_1000_0101u64; + let s0 = 0b0000_0000_0011_0101u64; + + let m1 = 0b1110_1011_1110_1111u64; + let s1 = 0b0001_0111_0100_0011u64; + + assert_eq!(bmi2::_pext_u64(n, m0), s0); + assert_eq!(bmi2::_pext_u64(n, m1), s1); + } + + #[test] + #[target_feature = "+bmi2"] + fn _pdep_u32() { + let n = 0b1011_1110_1001_0011u32; + + let m0 = 0b0110_0011_1000_0101u32; + let s0 = 0b0000_0010_0000_0101u32; + + let m1 = 0b1110_1011_1110_1111u32; + let s1 = 0b1110_1001_0010_0011u32; + + assert_eq!(bmi2::_pdep_u32(n, m0), s0); + assert_eq!(bmi2::_pdep_u32(n, m1), s1); + } + + #[test] + #[target_feature = "+bmi2"] + fn _pdep_u64() { + let n = 0b1011_1110_1001_0011u64; + + let m0 = 0b0110_0011_1000_0101u64; + let s0 = 0b0000_0010_0000_0101u64; + + let m1 = 0b1110_1011_1110_1111u64; + let s1 = 0b1110_1001_0010_0011u64; + + assert_eq!(bmi2::_pdep_u64(n, m0), s0); + assert_eq!(bmi2::_pdep_u64(n, m1), s1); + } + + #[test] + #[target_feature = "+bmi2"] + fn 
_bzhi_u32() { + let n = 0b1111_0010u32; + let s = 0b0001_0010u32; + assert_eq!(bmi2::_bzhi_u32(n, 5), s); + } + + #[test] + #[target_feature = "+bmi2"] + fn _bzhi_u64() { + let n = 0b1111_0010u64; + let s = 0b0001_0010u64; + assert_eq!(bmi2::_bzhi_u64(n, 5), s); + } + + #[test] + #[target_feature = "+bmi2"] + fn _mulx_u32() { + let a: u32 = 4_294_967_200; + let b: u32 = 2; + let (lo, hi): (u32, u32) = bmi2::_mulx_u32(a, b); + // result = 8589934400 + // = 0b0001_1111_1111_1111_1111_1111_1111_0100_0000u64 + // ^~hi ^~lo~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + assert_eq!(lo, 0b1111_1111_1111_1111_1111_1111_0100_0000u32); + assert_eq!(hi, 0b0001u32); + } + + #[test] + #[target_feature = "+bmi2"] + fn _mulx_u64() { + let a: u64 = 9_223_372_036_854_775_800; + let b: u64 = 100; + let (lo, hi): (u64, u64) = bmi2::_mulx_u64(a, b); + // result = 922337203685477580000 + // = 0b00110001_11111111_11111111_11111111_11111111_11111111_11111111_11111100_11100000u128 + // ^~hi~~~~ ^~lo~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + assert_eq!(lo, 0b11111111_11111111_11111111_11111111_11111111_11111111_11111100_11100000u64); + assert_eq!(hi, 0b00110001u64); + } +} diff --git a/library/stdarch/src/x86/mod.rs b/library/stdarch/src/x86/mod.rs index f71698af7a7e..c840ffff25ac 100644 --- a/library/stdarch/src/x86/mod.rs +++ b/library/stdarch/src/x86/mod.rs @@ -6,6 +6,11 @@ pub use self::sse42::*; pub use self::avx::*; pub use self::avx2::*; +pub use self::abm::*; +pub use self::bmi::*; +pub use self::bmi2::*; +pub use self::tbm::*; + #[allow(non_camel_case_types)] pub type __m128i = ::v128::i8x16; #[allow(non_camel_case_types)] @@ -20,3 +25,8 @@ mod sse41; mod sse42; mod avx; mod avx2; + +mod abm; +mod bmi; +mod bmi2; +mod tbm; diff --git a/library/stdarch/src/x86/sse.rs b/library/stdarch/src/x86/sse.rs index c03735624c75..cfb616a1645a 100644 --- a/library/stdarch/src/x86/sse.rs +++ b/library/stdarch/src/x86/sse.rs @@ -1,9 +1,14 @@ +use simd_llvm::simd_shuffle4; use 
v128::*; +#[cfg(test)] +use assert_instr::assert_instr; + /// Return the square root of packed single-precision (32-bit) floating-point /// elements in `a`. #[inline(always)] #[target_feature = "+sse"] +#[cfg_attr(test, assert_instr(sqrtps))] pub fn _mm_sqrt_ps(a: f32x4) -> f32x4 { unsafe { sqrtps(a) } } @@ -12,6 +17,7 @@ pub fn _mm_sqrt_ps(a: f32x4) -> f32x4 { /// floating-point elements in `a`. #[inline(always)] #[target_feature = "+sse"] +#[cfg_attr(test, assert_instr(rcpps))] pub fn _mm_rcp_ps(a: f32x4) -> f32x4 { unsafe { rcpps(a) } } @@ -20,6 +26,7 @@ pub fn _mm_rcp_ps(a: f32x4) -> f32x4 { /// (32-bit) floating-point elements in `a`. #[inline(always)] #[target_feature = "+sse"] +#[cfg_attr(test, assert_instr(rsqrtps))] pub fn _mm_rsqrt_ps(a: f32x4) -> f32x4 { unsafe { rsqrtps(a) } } @@ -28,6 +35,7 @@ pub fn _mm_rsqrt_ps(a: f32x4) -> f32x4 { /// `b`, and return the corresponding minimum values. #[inline(always)] #[target_feature = "+sse"] +#[cfg_attr(test, assert_instr(minps))] pub fn _mm_min_ps(a: f32x4, b: f32x4) -> f32x4 { unsafe { minps(a, b) } } @@ -36,16 +44,26 @@ pub fn _mm_min_ps(a: f32x4, b: f32x4) -> f32x4 { /// `b`, and return the corresponding maximum values. #[inline(always)] #[target_feature = "+sse"] +#[cfg_attr(test, assert_instr(maxps))] pub fn _mm_max_ps(a: f32x4, b: f32x4) -> f32x4 { unsafe { maxps(a, b) } } +/// Unpack and interleave single-precision (32-bit) floating-point elements +/// from the high half of `a` and `b`; +#[inline(always)] +#[target_feature = "+sse"] +pub fn _mm_unpackhi_ps(a: f32x4, b: f32x4) -> f32x4 { + unsafe { simd_shuffle4(a, b, [2, 6, 3, 7]) } +} + /// Return a mask of the most significant bit of each element in `a`. /// /// The mask is stored in the 4 least significant bits of the return value. /// All other bits are set to `0`. 
#[inline(always)] #[target_feature = "+sse"] +#[cfg_attr(test, assert_instr(movmskps))] pub fn _mm_movemask_ps(a: f32x4) -> i32 { unsafe { movmskps(a) } } @@ -66,7 +84,7 @@ extern { fn movmskps(a: f32x4) -> i32; } -#[cfg(test)] +#[cfg(all(test, target_feature = "sse", any(target_arch = "x86", target_arch = "x86_64")))] mod tests { use v128::*; use x86::sse; @@ -116,6 +134,15 @@ mod tests { assert_eq!(r, f32x4::new(-1.0, 20.0, 0.0, -5.0)); } + #[test] + #[target_feature = "+sse"] + fn _mm_unpackhi_ps() { + let a = f32x4::new(1.0, 2.0, 3.0, 4.0); + let b = f32x4::new(5.0, 6.0, 7.0, 8.0); + let r = sse::_mm_unpackhi_ps(a, b); + assert_eq!(r, f32x4::new(3.0, 7.0, 4.0, 8.0)); + } + #[test] #[target_feature = "+sse"] fn _mm_movemask_ps() { diff --git a/library/stdarch/src/x86/sse2.rs b/library/stdarch/src/x86/sse2.rs index b564677ea786..e67c96518061 100644 --- a/library/stdarch/src/x86/sse2.rs +++ b/library/stdarch/src/x86/sse2.rs @@ -9,6 +9,9 @@ use x86::__m128i; use v128::*; use v64::*; +#[cfg(test)] +use assert_instr::assert_instr; + /// Provide a hint to the processor that the code sequence is a spin-wait loop. /// /// This can help improve the performance and power consumption of spin-wait @@ -89,6 +92,7 @@ pub fn _mm_adds_epi8(a: i8x16, b: i8x16) -> i8x16 { /// Add packed 16-bit integers in `a` and `b` using saturation. 
#[inline(always)] #[target_feature = "+sse2"] +#[cfg_attr(test, assert_instr(paddsw))] pub fn _mm_adds_epi16(a: i16x8, b: i16x8) -> i16x8 { unsafe { paddsw(a, b) } } @@ -1716,7 +1720,7 @@ extern { fn movmskpd(a: f64x2) -> i32; } -#[cfg(test)] +#[cfg(all(test, target_feature = "sse2", any(target_arch = "x86", target_arch = "x86_64")))] mod tests { use std::os::raw::c_void; diff --git a/library/stdarch/src/x86/sse41.rs b/library/stdarch/src/x86/sse41.rs index 6dac82d25e32..955026e2b4d0 100644 --- a/library/stdarch/src/x86/sse41.rs +++ b/library/stdarch/src/x86/sse41.rs @@ -57,7 +57,7 @@ extern { fn dpps(a: f32x4, b: f32x4, imm8: u8) -> f32x4; } -#[cfg(test)] +#[cfg(all(test, target_feature = "sse4.1", any(target_arch = "x86", target_arch = "x86_64")))] mod tests { use v128::*; use x86::sse41; diff --git a/library/stdarch/src/x86/sse42.rs b/library/stdarch/src/x86/sse42.rs index 6cc120aca02d..7459997f7b5c 100644 --- a/library/stdarch/src/x86/sse42.rs +++ b/library/stdarch/src/x86/sse42.rs @@ -40,7 +40,7 @@ extern { fn pcmpestri128(a: __m128i, la: i32, b: __m128i, lb: i32, imm8: i8) -> i32; } -#[cfg(test)] +#[cfg(all(test, target_feature = "sse4.2", any(target_arch = "x86", target_arch = "x86_64")))] mod tests { use v128::*; use x86::{__m128i, sse42}; diff --git a/library/stdarch/src/x86/ssse3.rs b/library/stdarch/src/x86/ssse3.rs index b4b9ce21d421..1d1497f5d0d1 100644 --- a/library/stdarch/src/x86/ssse3.rs +++ b/library/stdarch/src/x86/ssse3.rs @@ -50,7 +50,7 @@ extern { fn pshufb128(a: u8x16, b: u8x16) -> u8x16; } -#[cfg(test)] +#[cfg(all(test, target_feature = "ssse3", any(target_arch = "x86", target_arch = "x86_64")))] mod tests { use v128::*; use x86::ssse3 as ssse3; diff --git a/library/stdarch/src/x86/tbm.rs b/library/stdarch/src/x86/tbm.rs new file mode 100644 index 000000000000..213188536a33 --- /dev/null +++ b/library/stdarch/src/x86/tbm.rs @@ -0,0 +1,393 @@ +//! Trailing Bit Manipulation (TBM) instruction set. +//! +//! 
The reference is [AMD64 Architecture Programmer's Manual, Volume 3: +//! General-Purpose and System +//! Instructions](http://support.amd.com/TechDocs/24594.pdf). +//! +//! [Wikipedia](https://en.wikipedia.org/wiki/Bit_Manipulation_Instruction_Sets#TBM_.28Trailing_Bit_Manipulation.29) +//! provides a quick overview of the available instructions. + +#[cfg(test)] +use assert_instr::assert_instr; + +// TODO: LLVM-CODEGEN ERROR: LLVM ERROR: Cannot select: intrinsic %llvm.x86.tbm.bextri.u32 +/* +#[allow(dead_code)] +extern "C" { + #[link_name="llvm.x86.tbm.bextri.u32"] + fn x86_tbm_bextri_u32(a: u32, y: u32) -> u32; + #[link_name="llvm.x86.tbm.bextri.u64"] + fn x86_tbm_bextri_u64(x: u64, y: u64) -> u64; +} + +/// Extracts bits in range [`start`, `start` + `length`) from `a` into +/// the least significant bits of the result. +#[inline(always)] +#[target_feature = "+tbm"] +pub fn _bextr_u32(a: u32, start: u32, len: u32) -> u32 { + _bextr2_u32(a, (start & 0xffu32) | ((len & 0xffu32) << 8u32)) +} + +/// Extracts bits in range [`start`, `start` + `length`) from `a` into +/// the least significant bits of the result. +#[inline(always)] +#[target_feature = "+tbm"] +pub fn _bextr_u64(a: u64, start: u64, len: u64) -> u64 { + _bextr2_u64(a, (start & 0xffu64) | ((len & 0xffu64) << 8u64)) +} + +/// Extracts bits of `a` specified by `control` into +/// the least significant bits of the result. +/// +/// Bits [7,0] of `control` specify the index to the first bit in the range to be +/// extracted, and bits [15,8] specify the length of the range. +#[inline(always)] +#[target_feature = "+tbm"] +pub fn _bextr2_u32(a: u32, control: u32) -> u32 { + unsafe { x86_tbm_bextri_u32(a, control) } +} + +/// Extracts bits of `a` specified by `control` into +/// the least significant bits of the result. +/// +/// Bits [7,0] of `control` specify the index to the first bit in the range to be +/// extracted, and bits [15,8] specify the length of the range. 
+#[inline(always)] +#[target_feature = "+tbm"] +pub fn _bextr2_u64(a: u64, control: u64) -> u64 { + unsafe { x86_tbm_bextri_u64(a, control) } +} +*/ + +/// Clears all bits below the least significant zero bit of `x`. +/// +/// If there is no zero bit in `x`, it returns zero. +#[inline(always)] +#[target_feature = "+tbm"] +#[cfg_attr(test, assert_instr(blcfill))] +pub fn _blcfill_u32(x: u32) -> u32 { + x & (x.wrapping_add(1)) +} + +/// Clears all bits below the least significant zero bit of `x`. +/// +/// If there is no zero bit in `x`, it returns zero. +#[inline(always)] +#[target_feature = "+tbm"] +#[cfg_attr(test, assert_instr(blcfill))] +pub fn _blcfill_u64(x: u64) -> u64 { + x & (x.wrapping_add(1)) +} + +/// Sets all bits of `x` to 1 except for the least significant zero bit. +/// +/// If there is no zero bit in `x`, it sets all bits. +#[inline(always)] +#[target_feature = "+tbm"] +#[cfg_attr(test, assert_instr(blci))] +pub fn _blci_u32(x: u32) -> u32 { + x | !(x.wrapping_add(1)) +} + +/// Sets all bits of `x` to 1 except for the least significant zero bit. +/// +/// If there is no zero bit in `x`, it sets all bits. +#[inline(always)] +#[target_feature = "+tbm"] +#[cfg_attr(test, assert_instr(blci))] +pub fn _blci_u64(x: u64) -> u64 { + x | !(x.wrapping_add(1)) +} + +/// Sets the least significant zero bit of `x` and clears all other bits. +/// +/// If there is no zero bit in `x`, it returns zero. +#[inline(always)] +#[target_feature = "+tbm"] +#[cfg_attr(test, assert_instr(blcic))] +pub fn _blcic_u32(x: u32) -> u32 { + !x & (x.wrapping_add(1)) +} + +/// Sets the least significant zero bit of `x` and clears all other bits. +/// +/// If there is no zero bit in `x`, it returns zero. +#[inline(always)] +#[target_feature = "+tbm"] +#[cfg_attr(test, assert_instr(blcic))] +pub fn _blcic_u64(x: u64) -> u64 { + !x & (x.wrapping_add(1)) +} + +/// Sets the least significant zero bit of `x` and clears all bits above that bit. 
+/// +/// If there is no zero bit in `x`, it sets all the bits. +#[inline(always)] +#[target_feature = "+tbm"] +#[cfg_attr(test, assert_instr(blcmsk))] +pub fn _blcmsk_u32(x: u32) -> u32 { + x ^ (x.wrapping_add(1)) +} + +/// Sets the least significant zero bit of `x` and clears all bits above that bit. +/// +/// If there is no zero bit in `x`, it sets all the bits. +#[inline(always)] +#[target_feature = "+tbm"] +#[cfg_attr(test, assert_instr(blcmsk))] +pub fn _blcmsk_u64(x: u64) -> u64 { + x ^ (x.wrapping_add(1)) +} + +/// Sets the least significant zero bit of `x`. +/// +/// If there is no zero bit in `x`, it returns `x`. +#[inline(always)] +#[target_feature = "+tbm"] +#[cfg_attr(test, assert_instr(blcs))] +pub fn _blcs_u32(x: u32) -> u32 { + x | (x.wrapping_add(1)) +} + +/// Sets the least significant zero bit of `x`. +/// +/// If there is no zero bit in `x`, it returns `x`. +#[inline(always)] +#[target_feature = "+tbm"] +#[cfg_attr(test, assert_instr(blcs))] +pub fn _blcs_u64(x: u64) -> u64 { + x | x.wrapping_add(1) +} + +/// Sets all bits of `x` below the least significant one. +/// +/// If there is no set bit in `x`, it sets all the bits. +#[inline(always)] +#[target_feature = "+tbm"] +#[cfg_attr(test, assert_instr(blsfill))] +pub fn _blsfill_u32(x: u32) -> u32 { + x | (x.wrapping_sub(1)) +} + +/// Sets all bits of `x` below the least significant one. +/// +/// If there is no set bit in `x`, it sets all the bits. +#[inline(always)] +#[target_feature = "+tbm"] +#[cfg_attr(test, assert_instr(blsfill))] +pub fn _blsfill_u64(x: u64) -> u64 { + x | (x.wrapping_sub(1)) +} + +/// Clears least significant bit and sets all other bits. +/// +/// If there is no set bit in `x`, it sets all the bits. +#[inline(always)] +#[target_feature = "+tbm"] +#[cfg_attr(test, assert_instr(blsic))] +pub fn _blsic_u32(x: u32) -> u32 { + !x | (x.wrapping_sub(1)) +} + +/// Clears least significant bit and sets all other bits. +/// +/// If there is no set bit in `x`, it sets all the bits. 
+#[inline(always)] +#[target_feature = "+tbm"] +#[cfg_attr(test, assert_instr(blsic))] +pub fn _blsic_u64(x: u64) -> u64 { + !x | (x.wrapping_sub(1)) +} + +/// Clears all bits below the least significant zero of `x` and sets all other +/// bits. +/// +/// If the least significant bit of `x` is 0, it sets all bits. +#[inline(always)] +#[target_feature = "+tbm"] +#[cfg_attr(test, assert_instr(t1mskc))] +pub fn _t1mskc_u32(x: u32) -> u32 { + !x | (x.wrapping_add(1)) +} + +/// Clears all bits below the least significant zero of `x` and sets all other +/// bits. +/// +/// If the least significant bit of `x` is 0, it sets all bits. +#[inline(always)] +#[target_feature = "+tbm"] +#[cfg_attr(test, assert_instr(t1mskc))] +pub fn _t1mskc_u64(x: u64) -> u64 { + !x | (x.wrapping_add(1)) +} + +/// Sets all bits below the least significant one of `x` and clears all other +/// bits. +/// +/// If the least significant bit of `x` is 1, it returns zero. +#[inline(always)] +#[target_feature = "+tbm"] +#[cfg_attr(test, assert_instr(tzmsk))] +pub fn _tzmsk_u32(x: u32) -> u32 { + !x & (x.wrapping_sub(1)) +} + +/// Sets all bits below the least significant one of `x` and clears all other +/// bits. +/// +/// If the least significant bit of `x` is 1, it returns zero. 
+#[inline(always)] +#[target_feature = "+tbm"] +#[cfg_attr(test, assert_instr(tzmsk))] +pub fn _tzmsk_u64(x: u64) -> u64 { + !x & (x.wrapping_sub(1)) +} + +#[cfg(all(test, target_feature = "tbm", any(target_arch = "x86", target_arch = "x86_64")))] +mod tests { + use x86::tbm; + + /* + #[test] + #[target_feature = "+tbm"] + fn _bextr_u32() { + assert_eq!(tbm::_bextr_u32(0b0101_0000u32, 4, 4), 0b0000_0101u32); + } + + #[test] + #[target_feature = "+tbm"] + fn _bextr_u64() { + assert_eq!(tbm::_bextr_u64(0b0101_0000u64, 4, 4), 0b0000_0101u64); + } + */ + + #[test] + #[target_feature = "+tbm"] + fn _blcfill_u32() { + assert_eq!(tbm::_blcfill_u32(0b0101_0111u32), 0b0101_0000u32); + assert_eq!(tbm::_blcfill_u32(0b1111_1111u32), 0u32); + } + + #[test] + #[target_feature = "+tbm"] + fn _blcfill_u64() { + assert_eq!(tbm::_blcfill_u64(0b0101_0111u64), 0b0101_0000u64); + assert_eq!(tbm::_blcfill_u64(0b1111_1111u64), 0u64); + } + + #[test] + #[target_feature = "+tbm"] + fn _blci_u32() { + assert_eq!(tbm::_blci_u32(0b0101_0000u32), + 0b1111_1111_1111_1111_1111_1111_1111_1110u32); + assert_eq!(tbm::_blci_u32(0b1111_1111u32), + 0b1111_1111_1111_1111_1111_1110_1111_1111u32); + } + + #[test] + #[target_feature = "+tbm"] + fn _blci_u64() { + assert_eq!(tbm::_blci_u64(0b0101_0000u64), + 0b1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1110u64); + assert_eq!(tbm::_blci_u64(0b1111_1111u64), + 0b1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1110_1111_1111u64); + } + + #[test] + #[target_feature = "+tbm"] + fn _blcic_u32() { + assert_eq!(tbm::_blcic_u32(0b0101_0001u32), 0b0000_0010u32); + assert_eq!(tbm::_blcic_u32(0b1111_1111u32), 0b1_0000_0000u32); + } + + #[test] + #[target_feature = "+tbm"] + fn _blcic_u64() { + assert_eq!(tbm::_blcic_u64(0b0101_0001u64), 0b0000_0010u64); + assert_eq!(tbm::_blcic_u64(0b1111_1111u64), 0b1_0000_0000u64); + } + + #[test] + #[target_feature = "+tbm"] + fn _blcmsk_u32() { + 
assert_eq!(tbm::_blcmsk_u32(0b0101_0001u32), 0b0000_0011u32); + assert_eq!(tbm::_blcmsk_u32(0b1111_1111u32), 0b1_1111_1111u32); + } + + #[test] + #[target_feature = "+tbm"] + fn _blcmsk_u64() { + assert_eq!(tbm::_blcmsk_u64(0b0101_0001u64), 0b0000_0011u64); + assert_eq!(tbm::_blcmsk_u64(0b1111_1111u64), 0b1_1111_1111u64); + } + + #[test] + #[target_feature = "+tbm"] + fn _blcs_u32() { + assert_eq!(tbm::_blcs_u32(0b0101_0001u32), 0b0101_0011u32); + assert_eq!(tbm::_blcs_u32(0b1111_1111u32), 0b1_1111_1111u32); + } + + #[test] + #[target_feature = "+tbm"] + fn _blcs_u64() { + assert_eq!(tbm::_blcs_u64(0b0101_0001u64), 0b0101_0011u64); + assert_eq!(tbm::_blcs_u64(0b1111_1111u64), 0b1_1111_1111u64); + } + + #[test] + #[target_feature = "+tbm"] + fn _blsfill_u32() { + assert_eq!(tbm::_blsfill_u32(0b0101_0100u32), 0b0101_0111u32); + assert_eq!(tbm::_blsfill_u32(0u32), 0b1111_1111_1111_1111_1111_1111_1111_1111u32); + } + + #[test] + #[target_feature = "+tbm"] + fn _blsfill_u64() { + assert_eq!(tbm::_blsfill_u64(0b0101_0100u64), 0b0101_0111u64); + assert_eq!(tbm::_blsfill_u64(0u64), 0b1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111u64); + } + + #[test] + #[target_feature = "+tbm"] + fn _blsic_u32() { + assert_eq!(tbm::_blsic_u32(0b0101_0100u32), 0b1111_1111_1111_1111_1111_1111_1111_1011u32); + assert_eq!(tbm::_blsic_u32(0u32), 0b1111_1111_1111_1111_1111_1111_1111_1111u32); + } + + #[test] + #[target_feature = "+tbm"] + fn _blsic_u64() { + assert_eq!(tbm::_blsic_u64(0b0101_0100u64), 0b1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1011u64); + assert_eq!(tbm::_blsic_u64(0u64), 0b1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111u64); + } + + #[test] + #[target_feature = "+tbm"] + fn _t1mskc_u32() { + assert_eq!(tbm::_t1mskc_u32(0b0101_0111u32), 0b1111_1111_1111_1111_1111_1111_1111_1000u32); + assert_eq!(tbm::_t1mskc_u32(0u32), 0b1111_1111_1111_1111_1111_1111_1111_1111u32); + } + + #[test] 
+ #[target_feature = "+tbm"] + fn _t1mksc_u64() { + assert_eq!(tbm::_t1mskc_u64(0b0101_0111u64), 0b1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1000u64); + assert_eq!(tbm::_t1mskc_u64(0u64), 0b1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111u64); + } + + #[test] + #[target_feature = "+tbm"] + fn _tzmsk_u32() { + assert_eq!(tbm::_tzmsk_u32(0b0101_1000u32), 0b0000_0111u32); + assert_eq!(tbm::_tzmsk_u32(0b0101_1001u32), 0b0000_0000u32); + } + + #[test] + #[target_feature = "+tbm"] + fn _tzmsk_u64() { + assert_eq!(tbm::_tzmsk_u64(0b0101_1000u64), 0b0000_0111u64); + assert_eq!(tbm::_tzmsk_u64(0b0101_1001u64), 0b0000_0000u64); + } +}