Merge remote-tracking branch 'upstream/master'

This commit is contained in:
Dávid Kocsis 2017-09-22 21:00:21 +02:00
commit ca55004659
28 changed files with 1582 additions and 14 deletions

View file

@ -0,0 +1,24 @@
environment:
# We don't want to do identical comdat folding as it messes up the ability to
# generate lossless backtraces in some cases. This is enabled by rustc by
# default so pass a flag to disable it to ensure our tests work ok.
RUSTFLAGS: -Clink-args=/OPT:NOICF
matrix:
- TARGET: x86_64-pc-windows-msvc
install:
# Install rust, x86_64-pc-windows-msvc host
- appveyor-retry appveyor DownloadFile https://win.rustup.rs/ -FileName rustup-init.exe
- rustup-init.exe -y --default-host x86_64-pc-windows-msvc --default-toolchain nightly
- set PATH=%PATH%;C:\Users\appveyor\.cargo\bin
- if NOT "%TARGET%" == "x86_64-pc-windows-msvc" rustup target add %TARGET%
- rustc -vV
- cargo -vV
build: false
test_script:
- cargo test --target %TARGET%
- set RUST_BACKTRACE=1
- cargo test --target %TARGET% --release

View file

@ -0,0 +1,16 @@
language: rust
sudo: false
matrix:
include:
- rust: nightly
- rust: nightly
os: osx
script:
- cargo test
- cargo test --release
notifications:
email:
on_success: never

0
library/stdarch/.vscode/temp.sql vendored Normal file
View file

View file

@ -14,6 +14,7 @@ example for `_mm_adds_epi16`:
/// Add packed 16-bit integers in `a` and `b` using saturation.
#[inline(always)]
#[target_feature = "+sse2"]
#[cfg_attr(test, assert_instr(paddsw))]
pub fn _mm_adds_epi16(a: i16x8, b: i16x8) -> i16x8 {
unsafe { paddsw(a, b) }
}
@ -32,6 +33,10 @@ Let's break this down:
support `sse2`, the compiler will still generate code for `_mm_adds_epi16`
*as if* `sse2` support existed. Without this attribute, the compiler might
not generate the intended CPU instruction.
* The `#[cfg_attr(test, assert_instr(paddsw))]` attribute indicates that when
we're testing the crate we'll assert that the `paddsw` instruction is
generated inside this function, ensuring that the SIMD intrinsic truly is an
intrinsic for the instruction!
* The types of the vectors given to the intrinsic should generally match the
types as provided in the vendor interface. We'll talk about this more below.
* The implementation of the vendor intrinsic is generally very simple.
@ -40,7 +45,7 @@ Let's break this down:
compiler intrinsic (in this case, `paddsw`) when one is available. More on
this below as well.
Once a function has been added, you should add at least one test for basic
Once a function has been added, you should also add at least one test for basic
functionality. Here's an example for `_mm_adds_epi16`:
```rust

View file

@ -13,3 +13,10 @@ license = "MIT"
[profile.release]
debug = true
opt-level = 3
[profile.bench]
debug = true
opt-level = 3
[dev-dependencies]
assert-instr = { path = "assert-instr" }

View file

@ -155,7 +155,7 @@ sse
* [ ] `_mm_storer_ps`
* [ ] `_mm_move_ss`
* [ ] `_mm_shuffle_ps`
* [ ] `_mm_unpackhi_ps`
* [x] `_mm_unpackhi_ps`
* [ ] `_mm_unpacklo_ps`
* [ ] `_mm_movehl_ps`
* [ ] `_mm_movelh_ps`

View file

@ -0,0 +1,11 @@
[package]
name = "assert-instr"
version = "0.1.0"
authors = ["Alex Crichton <alex@alexcrichton.com>"]
[dependencies]
assert-instr-macro = { path = "assert-instr-macro" }
backtrace = "0.3"
cc = "1.0"
lazy_static = "0.2"
rustc-demangle = "0.1"

View file

@ -0,0 +1,7 @@
[package]
name = "assert-instr-macro"
version = "0.1.0"
authors = ["Alex Crichton <alex@alexcrichton.com>"]
[lib]
proc-macro = true

View file

@ -0,0 +1,10 @@
use std::env;
fn main() {
println!("cargo:rerun-if-changed=build.rs");
let opt_level = env::var("OPT_LEVEL").ok().and_then(|s| s.parse().ok()).unwrap_or(0);
let profile = env::var("PROFILE").unwrap_or(String::new());
if profile == "release" || opt_level >= 2 {
println!("cargo:rustc-cfg=optimized");
}
}

View file

@ -0,0 +1,71 @@
//! Implementation of the `#[assert_instr]` macro
//!
//! This macro is used when testing the `stdsimd` crate and is used to generate
//! test cases to assert that functions do indeed contain the instructions that
//! we're expecting them to contain.
//!
//! The procedural macro here is relatively simple, it simply appends a
//! `#[test]` function to the original token stream which asserts that the
//! function itself contains the relevant instruction.
#![feature(proc_macro)]
extern crate proc_macro;
use proc_macro::{TokenStream, Term, TokenNode, Delimiter};
#[proc_macro_attribute]
pub fn assert_instr(attr: TokenStream, item: TokenStream) -> TokenStream {
// Pull the annotated function's name out of the item's token stream.
let name = find_name(item.clone());
// The attribute payload must be exactly one parenthesized group, i.e.
// `#[assert_instr(foo)]` — anything else is a usage error.
let tokens = attr.into_iter().collect::<Vec<_>>();
if tokens.len() != 1 {
panic!("expected #[assert_instr(foo)]");
}
let tokens = match tokens[0].kind {
TokenNode::Group(Delimiter::Parenthesis, ref rest) => rest.clone(),
_ => panic!("expected #[assert_instr(foo)]"),
};
// ... and that group must contain exactly one bare identifier: the
// instruction mnemonic expected to show up in the disassembly.
let tokens = tokens.into_iter().collect::<Vec<_>>();
if tokens.len() != 1 {
panic!("expected #[assert_instr(foo)]");
}
let instr = match tokens[0].kind {
TokenNode::Term(term) => term,
_ => panic!("expected #[assert_instr(foo)]"),
};
// The disassembly check is only meaningful on optimized builds (the
// `optimized` cfg is emitted by build.rs for release / opt-level >= 2);
// otherwise the generated test is marked `#[ignore]`.
let ignore = if cfg!(optimized) {
""
} else {
"#[ignore]"
};
// Append a `#[test]` function that asks the assert-instr runtime to
// verify the expected instruction appears in this function's disassembly.
let test = format!("
#[test]
#[allow(non_snake_case)]
{ignore}
fn assert_instr_{name}() {{
::assert_instr::assert({name} as usize,
\"{name}\",
\"{instr}\");
}}
", name = name.as_str(), instr = instr.as_str(), ignore = ignore);
// Emit the original item followed by the generated test.
let test: TokenStream = test.parse().unwrap();
item.into_iter().chain(test.into_iter()).collect()
}
/// Scans `item` for the `fn` keyword and returns the identifier that
/// immediately follows it — i.e. the name of the annotated function.
///
/// Panics if no `fn` keyword is found or it is not followed by an
/// identifier.
fn find_name(item: TokenStream) -> Term {
    let mut iter = item.into_iter();
    // Consume tokens up to and including the `fn` keyword itself.
    while let Some(token) = iter.next() {
        match token.kind {
            TokenNode::Term(t) if t.as_str() == "fn" => break,
            _ => {}
        }
    }
    // The very next token after `fn` must be the function's name.
    if let Some(TokenNode::Term(name)) = iter.next().map(|t| t.kind) {
        name
    } else {
        panic!("failed to find function name")
    }
}

View file

@ -0,0 +1,273 @@
//! Runtime support needed for the `#![assert_instr]` macro
//!
//! This basically just disassembles the current executable and then parses the
//! output once globally and then provides the `assert` function which makes
//! assertions about the disassembly of a function.
#![feature(proc_macro)]
extern crate assert_instr_macro;
extern crate backtrace;
extern crate cc;
extern crate rustc_demangle;
#[macro_use]
extern crate lazy_static;
use std::collections::HashMap;
use std::env;
use std::process::Command;
use std::str;
pub use assert_instr_macro::*;
lazy_static! {
static ref DISASSEMBLY: HashMap<String, Vec<Function>> = disassemble_myself();
}
struct Function {
instrs: Vec<Instruction>,
}
struct Instruction {
parts: Vec<String>,
}
// Disassembles the currently-running test executable with a platform-
// appropriate tool and parses the result into a symbol -> functions map:
//   - x86_64 MSVC Windows: `dumpbin /DISASM` (located via the VS registry)
//   - other Windows targets: not implemented
//   - macOS: `otool -vt`
//   - everything else: `objdump --disassemble`
fn disassemble_myself() -> HashMap<String, Vec<Function>> {
let me = env::current_exe().expect("failed to get current exe");
if cfg!(target_arch = "x86_64") &&
cfg!(target_os = "windows") &&
cfg!(target_env = "msvc") {
let mut cmd = cc::windows_registry::find("x86_64-pc-windows-msvc", "dumpbin.exe")
.expect("failed to find `dumpbin` tool");
let output = cmd.arg("/DISASM").arg(&me).output()
.expect("failed to execute dumpbin");
// Echo status/stderr so CI logs show what went wrong before the assert.
println!("{}\n{}", output.status, String::from_utf8_lossy(&output.stderr));
assert!(output.status.success());
parse_dumpbin(&String::from_utf8_lossy(&output.stdout))
} else if cfg!(target_os = "windows") {
panic!("disassembly unimplemented")
} else if cfg!(target_os = "macos") {
let output = Command::new("otool")
.arg("-vt")
.arg(&me)
.output()
.expect("failed to execute otool");
println!("{}\n{}", output.status, String::from_utf8_lossy(&output.stderr));
assert!(output.status.success());
parse_otool(&str::from_utf8(&output.stdout).expect("stdout not utf8"))
} else {
let output = Command::new("objdump")
.arg("--disassemble")
.arg(&me)
.output()
.expect("failed to execute objdump");
println!("{}\n{}", output.status, String::from_utf8_lossy(&output.stderr));
assert!(output.status.success());
parse_objdump(&str::from_utf8(&output.stdout).expect("stdout not utf8"))
}
}
/// Parses `objdump --disassemble` output into a map from normalized symbol
/// name to every function found under that name.
///
/// Expected shape of the input:
///
/// ```text
/// $hex_addr <$symbol>:
///   $rel_offset: ab cd ef 00  $instruction ...
///                                             <- blank line ends a function
/// ```
fn parse_objdump(output: &str) -> HashMap<String, Vec<Function>> {
    let mut lines = output.lines();
    // Echo a prefix of the disassembly so CI failures are debuggable.
    for line in output.lines().take(100) {
        println!("{}", line);
    }
    let mut ret = HashMap::new();
    while let Some(header) = lines.next() {
        // Function headers look like `$hex_addr <$name>:`.
        if !header.ends_with(">:") {
            continue
        }
        let start = header.find("<").unwrap();
        let symbol = &header[start + 1..header.len() - 2];
        let mut instructions = Vec::new();
        while let Some(instruction) = lines.next() {
            // A blank line terminates the current function's listing.
            if instruction.is_empty() {
                break
            }
            // Each line of instructions should look like:
            //
            // $rel_offset: ab cd ef 00 $instruction...
            //
            // so drop the offset, then every two-digit hex byte, keeping only
            // the mnemonic and its operands.
            let parts = instruction.split_whitespace()
                .skip(1)
                .skip_while(|s| {
                    s.len() == 2 && usize::from_str_radix(s, 16).is_ok()
                })
                .map(|s| s.to_string())
                .collect::<Vec<String>>();
            instructions.push(Instruction { parts });
        }
        // A normalized symbol may occur more than once (e.g. multiple
        // monomorphizations), so accumulate all of them.
        ret.entry(normalize(symbol))
            .or_insert_with(Vec::new)
            .push(Function { instrs: instructions });
    }
    ret
}
// Parses `otool -vt` output into a map from normalized symbol name to the
// functions found under that name. `otool` prints one `$symbol:` header per
// function followed by `$addr $instruction...` lines; the next header is
// only discovered while scanning instruction lines, so it is stashed in
// `cached_header` and re-used on the following loop iteration.
fn parse_otool(output: &str) -> HashMap<String, Vec<Function>> {
let mut lines = output.lines();
// Echo a prefix of the disassembly so CI failures are debuggable.
for line in output.lines().take(100) {
println!("{}", line);
}
let mut ret = HashMap::new();
let mut cached_header = None;
loop {
// Prefer a header spotted by the inner loop; otherwise advance.
let header = match cached_header.take().or_else(|| lines.next()) {
Some(header) => header,
None => break,
};
// symbols should start with `$symbol:`
if !header.ends_with(":") {
continue
}
// strip the leading underscore and the trailing colon
let symbol = &header[1..header.len() - 1];
let mut instructions = Vec::new();
while let Some(instruction) = lines.next() {
// A line ending in `:` is the next function's header — stash it
// and stop collecting instructions for the current one.
if instruction.ends_with(":") {
cached_header = Some(instruction);
break
}
// Each line of instructions should look like:
//
// $addr $instruction...
let parts = instruction.split_whitespace()
.skip(1)
.map(|s| s.to_string())
.collect::<Vec<String>>();
instructions.push(Instruction { parts });
}
// The same normalized symbol can appear multiple times; keep them all.
ret.entry(normalize(symbol))
.or_insert(Vec::new())
.push(Function { instrs: instructions });
}
return ret
}
// Parses `dumpbin /DISASM` output into a map from normalized symbol name to
// the functions found under that name. Format mirrors parse_otool: a
// `$symbol:` header line, then indented instruction lines; the next header
// is discovered mid-scan and stashed in `cached_header`.
fn parse_dumpbin(output: &str) -> HashMap<String, Vec<Function>> {
let mut lines = output.lines();
// Echo a prefix of the disassembly so CI failures are debuggable.
for line in output.lines().take(100) {
println!("{}", line);
}
let mut ret = HashMap::new();
let mut cached_header = None;
loop {
let header = match cached_header.take().or_else(|| lines.next()) {
Some(header) => header,
None => break,
};
// symbols should start with `$symbol:`
if !header.ends_with(":") {
continue
}
// strip the trailing colon
let symbol = &header[..header.len() - 1];
let mut instructions = Vec::new();
// NOTE(review): the two `starts_with(" ")` literals below only make
// sense if they have *different* widths (a shallow indent marks an
// instruction line, a deeper indent marks a byte-continuation line to
// skip). As rendered here both are a single space, which would make
// the `continue` below unconditionally skip every instruction —
// presumably whitespace was collapsed in transit; confirm against the
// original source.
while let Some(instruction) = lines.next() {
if !instruction.starts_with(" ") {
cached_header = Some(instruction);
break
}
// Each line looks like:
//
// > $addr: ab cd ef $instr..
// > 00 12 # this line is optional
if instruction.starts_with(" ") {
continue
}
// Drop the address, then every two-digit hex byte, keeping only the
// mnemonic and operands.
let parts = instruction.split_whitespace()
.skip(1)
.skip_while(|s| {
s.len() == 2 && usize::from_str_radix(s, 16).is_ok()
})
.map(|s| s.to_string())
.collect::<Vec<String>>();
instructions.push(Instruction { parts });
}
// The same normalized symbol can appear multiple times; keep them all.
ret.entry(normalize(symbol))
.or_insert(Vec::new())
.push(Function { instrs: instructions });
}
return ret
}
/// Demangles `symbol` and strips the trailing `::h<hash>` disambiguator
/// that rustc appends, yielding a stable, human-readable symbol name.
fn normalize(symbol: &str) -> String {
    let demangled = rustc_demangle::demangle(symbol).to_string();
    if let Some(idx) = demangled.rfind("::h") {
        demangled[..idx].to_string()
    } else {
        demangled
    }
}
/// Main entry point for this crate, called by the `#[assert_instr]` macro.
///
/// This asserts that the function at `fnptr` contains the instruction
/// `expected` provided.
pub fn assert(fnptr: usize, fnname: &str, expected: &str) {
// Translate this function pointer to a symbolic name that we'd have found
// in the disassembly.
let mut sym = None;
backtrace::resolve(fnptr as *mut _, |name| {
sym = name.name().and_then(|s| s.as_str()).map(normalize);
});
// Look the resolved symbol up in the globally-parsed disassembly; on a
// miss, print whatever we know (resolved name, near-miss keys) to make
// the failure diagnosable, then panic.
let functions = match sym.as_ref().and_then(|s| DISASSEMBLY.get(s)) {
Some(s) => s,
None => {
if let Some(sym) = sym {
println!("assumed symbol name: `{}`", sym);
}
println!("maybe related functions");
for f in DISASSEMBLY.keys().filter(|k| k.contains(fnname)) {
println!("\t- {}", f);
}
panic!("failed to find disassembly of {:#x} ({})", fnptr, fnname);
}
};
// Exactly one copy of the function is expected under this symbol.
assert_eq!(functions.len(), 1);
let function = &functions[0];
// Look for `expected` as the first part of any instruction in this
// function, returning if we do indeed find it.
for instr in function.instrs.iter() {
// Gets the first instruction, e.g. tzcntl in tzcntl %rax,%rax
if let Some(part) = instr.parts.get(0) {
// Truncates the instruction with the length of the expected
// instruction: tzcntl => tzcnt and compares that.
if part.starts_with(expected) {
return
}
}
}
// Help debug by printing out the found disassembly, and then panic as we
// didn't find the instruction.
println!("disassembly for {}: ", sym.as_ref().unwrap());
for (i, instr) in function.instrs.iter().enumerate() {
print!("\t{:2}: ", i);
for part in instr.parts.iter() {
print!("{} ", part);
}
println!("");
}
panic!("failed to find instruction `{}` in the disassembly", expected);
}

View file

@ -0,0 +1,10 @@
//! ARM intrinsics.
pub use self::v6::*;
pub use self::v7::*;
#[cfg(target_arch = "aarch64")]
pub use self::v8::*;
mod v6;
mod v7;
#[cfg(target_arch = "aarch64")]
mod v8;

View file

@ -0,0 +1,25 @@
//! ARMv6 intrinsics.
//!
//! The reference is [ARMv6-M Architecture Reference
//! Manual](http://infocenter.arm.com/help/index.jsp?topic=/com.arm.doc.ddi0419c/index.html).
/// Reverse the order of the bytes.
///
/// For a single-byte value this is the identity; it is provided for API
/// symmetry with the wider `_rev_*` intrinsics.
#[inline(always)]
#[cfg_attr(test, assert_instr(rev))]
pub fn _rev_u8(x: u8) -> u8 {
    // `swap_bytes` already returns `u8`; the original `as u8` cast was
    // redundant.
    x.swap_bytes()
}
/// Reverse the order of the bytes.
#[inline(always)]
#[cfg_attr(test, assert_instr(rev))]
pub fn _rev_u16(x: u16) -> u16 {
    // `swap_bytes` already returns `u16`; the original `as u16` cast was
    // redundant.
    x.swap_bytes()
}
/// Reverse the order of the bytes.
#[inline(always)]
#[cfg_attr(test, assert_instr(rev))]
pub fn _rev_u32(x: u32) -> u32 {
    // `swap_bytes` already returns `u32`; the original `as u32` cast was
    // redundant.
    x.swap_bytes()
}

View file

@ -0,0 +1,40 @@
//! ARMv7 intrinsics.
//!
//! The reference is [ARMv7-M Architecture Reference Manual (Issue
//! E.b)](http://infocenter.arm.com/help/index.jsp?topic=/com.arm.doc.ddi0403e.b/index.html).
pub use super::v6::*;
/// Count Leading Zeros.
#[inline(always)]
#[cfg_attr(test, assert_instr(clz))]
pub fn _clz_u8(x: u8) -> u8 {
x.leading_zeros() as u8
}
/// Count Leading Zeros.
#[inline(always)]
#[cfg_attr(test, assert_instr(clz))]
pub fn _clz_u16(x: u16) -> u16 {
x.leading_zeros() as u16
}
/// Count Leading Zeros.
#[inline(always)]
#[cfg_attr(test, assert_instr(clz))]
pub fn _clz_u32(x: u32) -> u32 {
x.leading_zeros() as u32
}
#[allow(dead_code)]
extern "C" {
#[link_name="llvm.bitreverse.i32"]
fn rbit_u32(i: i32) -> i32;
}
/// Reverse the bit order.
#[inline(always)]
#[cfg_attr(test, assert_instr(rbit))]
pub fn _rbit_u32(x: u32) -> u32 {
unsafe { rbit_u32(x as i32) as u32 }
}

View file

@ -0,0 +1,54 @@
//! ARMv8 intrinsics.
//!
//! The reference is [ARMv8-A Reference Manual](http://infocenter.arm.com/help/index.jsp?topic=/com.arm.doc.ddi0487a.k_10775/index.html).
pub use super::v7::*;
/// Reverse the order of the bytes.
#[inline(always)]
#[cfg_attr(test, assert_instr(rev))]
pub fn _rev_u64(x: u64) -> u64 {
x.swap_bytes() as u64
}
/// Count Leading Zeros.
#[inline(always)]
#[cfg_attr(test, assert_instr(clz))]
pub fn _clz_u64(x: u64) -> u64 {
x.leading_zeros() as u64
}
#[allow(dead_code)]
extern "C" {
#[link_name="llvm.bitreverse.i64"]
fn rbit_u64(i: i64) -> i64;
}
/// Reverse the bit order.
#[inline(always)]
#[cfg_attr(test, assert_instr(rbit))]
pub fn _rbit_u64(x: u64) -> u64 {
unsafe { rbit_u64(x as i64) as u64 }
}
/// Counts the leading most significant bits set.
///
/// When all bits of the operand are set it returns the size of the operand in
/// bits.
#[inline(always)]
// LLVM Bug (should be cls): https://bugs.llvm.org/show_bug.cgi?id=31802
#[cfg_attr(test, assert_instr(clz))]
pub fn _cls_u32(x: u32) -> u32 {
u32::leading_zeros(!x) as u32
}
/// Counts the leading most significant bits set.
///
/// When all bits of the operand are set it returns the size of the operand in
/// bits.
#[inline(always)]
// LLVM Bug (should be cls): https://bugs.llvm.org/show_bug.cgi?id=31802
#[cfg_attr(test, assert_instr(clz))]
pub fn _cls_u64(x: u64) -> u64 {
u64::leading_zeros(!x) as u64
}

View file

@ -1,8 +1,12 @@
#![allow(dead_code)]
#![feature(
const_fn, link_llvm_intrinsics, platform_intrinsics, repr_simd, simd_ffi,
target_feature,
target_feature, cfg_target_feature, i128_type
)]
#![cfg_attr(test, feature(proc_macro))]
#[cfg(test)]
extern crate assert_instr;
/// Platform independent SIMD vector types and operations.
pub mod simd {
@ -16,6 +20,9 @@ pub mod simd {
pub mod vendor {
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
pub use x86::*;
#[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
pub use arm::*;
}
#[macro_use]
@ -27,3 +34,6 @@ mod v512;
mod v64;
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
mod x86;
#[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
mod arm;

View file

@ -0,0 +1,71 @@
//! Advanced Bit Manipulation (ABM) instructions
//!
//! The POPCNT and LZCNT have their own CPUID bits to indicate support.
//!
//! The references are:
//!
//! - [Intel 64 and IA-32 Architectures Software Developer's Manual Volume 2: Instruction Set Reference, A-Z](http://www.intel.de/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-instruction-set-reference-manual-325383.pdf).
//! - [AMD64 Architecture Programmer's Manual, Volume 3: General-Purpose and System Instructions](http://support.amd.com/TechDocs/24594.pdf).
//!
//! [Wikipedia](https://en.wikipedia.org/wiki/Bit_Manipulation_Instruction_Sets#ABM_.28Advanced_Bit_Manipulation.29)
//! provides a quick overview of the instructions available.
#[cfg(test)]
use assert_instr::assert_instr;
/// Counts the leading most significant zero bits.
///
/// When the operand is zero, it returns its size in bits.
#[inline(always)]
#[target_feature = "+lzcnt"]
#[cfg_attr(test, assert_instr(lzcnt))]
pub fn _lzcnt_u32(x: u32) -> u32 { x.leading_zeros() }
/// Counts the leading most significant zero bits.
///
/// When the operand is zero, it returns its size in bits.
#[inline(always)]
#[target_feature = "+lzcnt"]
#[cfg_attr(test, assert_instr(lzcnt))]
pub fn _lzcnt_u64(x: u64) -> u64 { x.leading_zeros() as u64 }
/// Counts the bits that are set.
#[inline(always)]
#[target_feature = "+popcnt"]
#[cfg_attr(test, assert_instr(popcnt))]
pub fn _popcnt32(x: u32) -> u32 { x.count_ones() }
/// Counts the bits that are set.
#[inline(always)]
#[target_feature = "+popcnt"]
#[cfg_attr(test, assert_instr(popcnt))]
pub fn _popcnt64(x: u64) -> u64 { x.count_ones() as u64 }
// NOTE(review): these tests exercise the ABM intrinsics (lzcnt/popcnt) but
// the module is gated on `target_feature = "bmi"` — presumably this should
// be gated on lzcnt/popcnt support instead; confirm against CI configuration.
#[cfg(all(test, target_feature = "bmi", any(target_arch = "x86", target_arch = "x86_64")))]
mod tests {
use x86::abm;
#[test]
#[target_feature = "+lzcnt"]
fn _lzcnt_u32() {
assert_eq!(abm::_lzcnt_u32(0b0101_1010u32), 25u32);
}
#[test]
#[target_feature = "+lzcnt"]
fn _lzcnt_u64() {
assert_eq!(abm::_lzcnt_u64(0b0101_1010u64), 57u64);
}
#[test]
#[target_feature = "+popcnt"]
fn _popcnt32() {
assert_eq!(abm::_popcnt32(0b0101_1010u32), 4);
}
#[test]
#[target_feature = "+popcnt"]
fn _popcnt64() {
assert_eq!(abm::_popcnt64(0b0101_1010u64), 4);
}
}

View file

@ -31,7 +31,7 @@ extern "C" {
}
#[cfg(test)]
#[cfg(all(test, target_feature = "avx", any(target_arch = "x86", target_arch = "x86_64")))]
mod tests {
use v256::*;
use x86::avx;
@ -65,7 +65,4 @@ mod tests {
let e = f64x4::new(-4.0,8.0,-4.0,12.0);
assert_eq!(r, e);
}
}
}

View file

@ -1044,7 +1044,7 @@ extern "C" {
}
#[cfg(test)]
#[cfg(all(test, target_feature = "avx2", any(target_arch = "x86", target_arch = "x86_64")))]
mod tests {
use v256::*;
use v128::*;

View file

@ -0,0 +1,288 @@
//! Bit Manipulation Instruction (BMI) Set 1.0.
//!
//! The reference is [Intel 64 and IA-32 Architectures Software Developer's
//! Manual Volume 2: Instruction Set Reference,
//! A-Z](http://www.intel.de/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-instruction-set-reference-manual-325383.pdf).
//!
//! [Wikipedia](https://en.wikipedia.org/wiki/Bit_Manipulation_Instruction_Sets#BMI1_.28Bit_Manipulation_Instruction_Set_1.29)
//! provides a quick overview of the available instructions.
#[cfg(test)]
use assert_instr::assert_instr;
#[allow(dead_code)]
extern "C" {
#[link_name="llvm.x86.bmi.bextr.32"]
fn x86_bmi_bextr_32(x: u32, y: u32) -> u32;
#[link_name="llvm.x86.bmi.bextr.64"]
fn x86_bmi_bextr_64(x: u64, y: u64) -> u64;
}
/// Extracts bits in range [`start`, `start` + `length`) from `a` into
/// the least significant bits of the result.
#[inline(always)]
#[target_feature = "+bmi"]
#[cfg_attr(test, assert_instr(bextr))]
pub fn _bextr_u32(a: u32, start: u32, len: u32) -> u32 {
_bextr2_u32(a, (start & 0xffu32) | ((len & 0xffu32) << 8u32))
}
/// Extracts bits in range [`start`, `start` + `length`) from `a` into
/// the least significant bits of the result.
#[inline(always)]
#[target_feature = "+bmi"]
#[cfg_attr(test, assert_instr(bextr))]
pub fn _bextr_u64(a: u64, start: u64, len: u64) -> u64 {
_bextr2_u64(a, (start & 0xffu64) | ((len & 0xffu64) << 8u64))
}
/// Extracts bits of `a` specified by `control` into
/// the least significant bits of the result.
///
/// Bits [7,0] of `control` specify the index to the first bit in the range to be
/// extracted, and bits [15,8] specify the length of the range.
#[inline(always)]
#[target_feature = "+bmi"]
#[cfg_attr(test, assert_instr(bextr))]
pub fn _bextr2_u32(a: u32, control: u32) -> u32 {
unsafe { x86_bmi_bextr_32(a, control) }
}
/// Extracts bits of `a` specified by `control` into
/// the least significant bits of the result.
///
/// Bits [7,0] of `control` specify the index to the first bit in the range to be
/// extracted, and bits [15,8] specify the length of the range.
#[inline(always)]
#[target_feature = "+bmi"]
#[cfg_attr(test, assert_instr(bextr))]
pub fn _bextr2_u64(a: u64, control: u64) -> u64 {
unsafe { x86_bmi_bextr_64(a, control) }
}
/// Bitwise logical `AND` of inverted `a` with `b`.
#[inline(always)]
#[target_feature = "+bmi"]
#[cfg_attr(test, assert_instr(andn))]
pub fn _andn_u32(a: u32, b: u32) -> u32 {
!a & b
}
/// Bitwise logical `AND` of inverted `a` with `b`.
#[inline(always)]
#[target_feature = "+bmi"]
#[cfg_attr(test, assert_instr(andn))]
pub fn _andn_u64(a: u64, b: u64) -> u64 {
!a & b
}
/// Extract lowest set isolated bit.
#[inline(always)]
#[target_feature = "+bmi"]
#[cfg_attr(test, assert_instr(blsi))]
pub fn _blsi_u32(x: u32) -> u32 {
x & x.wrapping_neg()
}
/// Extract lowest set isolated bit.
#[inline(always)]
#[target_feature = "+bmi"]
#[cfg_attr(test, assert_instr(blsi))]
pub fn _blsi_u64(x: u64) -> u64 {
x & x.wrapping_neg()
}
/// Get mask up to lowest set bit.
#[inline(always)]
#[target_feature = "+bmi"]
#[cfg_attr(test, assert_instr(blsmsk))]
pub fn _blsmsk_u32(x: u32) -> u32 {
x ^ (x.wrapping_sub(1u32))
}
/// Get mask up to lowest set bit.
#[inline(always)]
#[target_feature = "+bmi"]
#[cfg_attr(test, assert_instr(blsmsk))]
pub fn _blsmsk_u64(x: u64) -> u64 {
x ^ (x.wrapping_sub(1u64))
}
/// Resets the lowest set bit of `x`.
///
/// If `x` is `0`, the carry flag (CF) is set (per the BLSR instruction's
/// flag semantics).
#[inline(always)]
#[target_feature = "+bmi"]
#[cfg_attr(test, assert_instr(blsr))]
pub fn _blsr_u32(x: u32) -> u32 {
x & (x.wrapping_sub(1))
}
/// Resets the lowest set bit of `x`.
///
/// If `x` is `0`, the carry flag (CF) is set (per the BLSR instruction's
/// flag semantics).
#[inline(always)]
#[target_feature = "+bmi"]
#[cfg_attr(test, assert_instr(blsr))]
pub fn _blsr_u64(x: u64) -> u64 {
x & (x.wrapping_sub(1))
}
/// Counts the number of trailing least significant zero bits.
///
/// When the source operand is 0, it returns its size in bits.
#[inline(always)]
#[target_feature = "+bmi"]
#[cfg_attr(test, assert_instr(tzcnt))]
pub fn _tzcnt_u16(x: u16) -> u16 {
x.trailing_zeros() as u16
}
/// Counts the number of trailing least significant zero bits.
///
/// When the source operand is 0, it returns its size in bits.
#[inline(always)]
#[target_feature = "+bmi"]
#[cfg_attr(test, assert_instr(tzcnt))]
pub fn _tzcnt_u32(x: u32) -> u32 {
x.trailing_zeros()
}
/// Counts the number of trailing least significant zero bits.
///
/// When the source operand is 0, it returns its size in bits.
#[inline(always)]
#[target_feature = "+bmi"]
#[cfg_attr(test, assert_instr(tzcnt))]
pub fn _tzcnt_u64(x: u64) -> u64 {
x.trailing_zeros() as u64
}
/// Counts the number of trailing least significant zero bits.
///
/// When the source operand is 0, it returns its size in bits.
#[inline(always)]
#[target_feature = "+bmi"]
#[cfg_attr(test, assert_instr(tzcnt))]
pub fn _mm_tzcnt_u32(x: u32) -> u32 {
x.trailing_zeros()
}
/// Counts the number of trailing least significant zero bits.
///
/// When the source operand is 0, it returns its size in bits.
#[inline(always)]
#[target_feature = "+bmi"]
#[cfg_attr(test, assert_instr(tzcnt))]
pub fn _mm_tzcnt_u64(x: u64) -> u64 {
x.trailing_zeros() as u64
}
#[cfg(all(test, target_feature = "bmi", any(target_arch = "x86", target_arch = "x86_64")))]
mod tests {
use x86::bmi;
#[test]
#[target_feature = "+bmi"]
fn _bextr_u32() {
assert_eq!(bmi::_bextr_u32(0b0101_0000u32, 4, 4), 0b0000_0101u32);
}
#[test]
#[target_feature = "+bmi"]
fn _bextr_u64() {
assert_eq!(bmi::_bextr_u64(0b0101_0000u64, 4, 4), 0b0000_0101u64);
}
#[test]
#[target_feature = "+bmi"]
fn _andn_u32() {
assert_eq!(bmi::_andn_u32(0, 0), 0);
assert_eq!(bmi::_andn_u32(0, 1), 1);
assert_eq!(bmi::_andn_u32(1, 0), 0);
assert_eq!(bmi::_andn_u32(1, 1), 0);
assert_eq!(bmi::_andn_u32(0b0000_0000u32, 0b0000_0000u32), 0b0000_0000u32);
assert_eq!(bmi::_andn_u32(0b0000_0000u32, 0b1111_1111u32), 0b1111_1111u32);
assert_eq!(bmi::_andn_u32(0b1111_1111u32, 0b0000_0000u32), 0b0000_0000u32);
assert_eq!(bmi::_andn_u32(0b1111_1111u32, 0b1111_1111u32), 0b0000_0000u32);
assert_eq!(bmi::_andn_u32(0b0100_0000u32, 0b0101_1101u32), 0b0001_1101u32);
}
#[test]
#[target_feature = "+bmi"]
fn _andn_u64() {
assert_eq!(bmi::_andn_u64(0, 0), 0);
assert_eq!(bmi::_andn_u64(0, 1), 1);
assert_eq!(bmi::_andn_u64(1, 0), 0);
assert_eq!(bmi::_andn_u64(1, 1), 0);
assert_eq!(bmi::_andn_u64(0b0000_0000u64, 0b0000_0000u64), 0b0000_0000u64);
assert_eq!(bmi::_andn_u64(0b0000_0000u64, 0b1111_1111u64), 0b1111_1111u64);
assert_eq!(bmi::_andn_u64(0b1111_1111u64, 0b0000_0000u64), 0b0000_0000u64);
assert_eq!(bmi::_andn_u64(0b1111_1111u64, 0b1111_1111u64), 0b0000_0000u64);
assert_eq!(bmi::_andn_u64(0b0100_0000u64, 0b0101_1101u64), 0b0001_1101u64);
}
#[test]
#[target_feature = "+bmi"]
fn _blsi_u32() {
assert_eq!(bmi::_blsi_u32(0b1101_0000u32), 0b0001_0000u32);
}
#[test]
#[target_feature = "+bmi"]
fn _blsi_u64() {
assert_eq!(bmi::_blsi_u64(0b1101_0000u64), 0b0001_0000u64);
}
#[test]
#[target_feature = "+bmi"]
fn _blsmsk_u32() {
assert_eq!(bmi::_blsmsk_u32(0b0011_0000u32), 0b0001_1111u32);
}
#[test]
#[target_feature = "+bmi"]
fn _blsmsk_u64() {
assert_eq!(bmi::_blsmsk_u64(0b0011_0000u64), 0b0001_1111u64);
}
#[test]
#[target_feature = "+bmi"]
fn _blsr_u32() {
    // TODO: test the behavior when the input is 0
    // (was a `///` doc comment, which is invalid on an expression
    // statement — doc comments must document an item.)
    assert_eq!(bmi::_blsr_u32(0b0011_0000u32), 0b0010_0000u32);
}
#[test]
#[target_feature = "+bmi"]
fn _blsr_u64() {
    // TODO: test the behavior when the input is 0
    // (was a `///` doc comment, which is invalid on an expression
    // statement — doc comments must document an item.)
    assert_eq!(bmi::_blsr_u64(0b0011_0000u64), 0b0010_0000u64);
}
#[test]
#[target_feature = "+bmi"]
fn _tzcnt_u16() {
assert_eq!(bmi::_tzcnt_u16(0b0000_0001u16), 0u16);
assert_eq!(bmi::_tzcnt_u16(0b0000_0000u16), 16u16);
assert_eq!(bmi::_tzcnt_u16(0b1001_0000u16), 4u16);
}
#[test]
#[target_feature = "+bmi"]
fn _tzcnt_u32() {
assert_eq!(bmi::_tzcnt_u32(0b0000_0001u32), 0u32);
assert_eq!(bmi::_tzcnt_u32(0b0000_0000u32), 32u32);
assert_eq!(bmi::_tzcnt_u32(0b1001_0000u32), 4u32);
}
#[test]
#[target_feature = "+bmi"]
fn _tzcnt_u64() {
assert_eq!(bmi::_tzcnt_u64(0b0000_0001u64), 0u64);
assert_eq!(bmi::_tzcnt_u64(0b0000_0000u64), 64u64);
assert_eq!(bmi::_tzcnt_u64(0b1001_0000u64), 4u64);
}
}

View file

@ -0,0 +1,215 @@
//! Bit Manipulation Instruction (BMI) Set 2.0.
//!
//! The reference is [Intel 64 and IA-32 Architectures Software Developer's
//! Manual Volume 2: Instruction Set Reference,
//! A-Z](http://www.intel.de/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-instruction-set-reference-manual-325383.pdf).
//!
//! [Wikipedia](https://en.wikipedia.org/wiki/Bit_Manipulation_Instruction_Sets#BMI2_.28Bit_Manipulation_Instruction_Set_2.29)
//! provides a quick overview of the available instructions.
#[cfg(test)]
use assert_instr::assert_instr;
/// Unsigned multiply without affecting flags.
///
/// Unsigned multiplication of `a` with `b` returning a pair `(lo, hi)` with
/// the low half and the high half of the result.
#[inline(always)]
// LLVM BUG (should be mulxl): https://bugs.llvm.org/show_bug.cgi?id=34232
#[cfg_attr(test, assert_instr(imul))]
#[target_feature = "+bmi2"]
pub fn _mulx_u32(a: u32, b: u32) -> (u32, u32) {
let result: u64 = (a as u64) * (b as u64);
let hi = (result >> 32) as u32;
(result as u32, hi)
}
/// Unsigned multiply without affecting flags.
///
/// Unsigned multiplication of `a` with `b` returning a pair `(lo, hi)` with
/// the low half and the high half of the result.
#[inline(always)]
#[cfg_attr(test, assert_instr(mulx))]
#[target_feature = "+bmi2"]
pub fn _mulx_u64(a: u64, b: u64) -> (u64, u64) {
let result: u128 = (a as u128) * (b as u128);
let hi = (result >> 64) as u64;
(result as u64, hi)
}
#[allow(dead_code)]
extern "C" {
#[link_name="llvm.x86.bmi.bzhi.32"]
fn x86_bmi2_bzhi_32(x: u32, y: u32) -> u32;
#[link_name="llvm.x86.bmi.bzhi.64"]
fn x86_bmi2_bzhi_64(x: u64, y: u64) -> u64;
#[link_name="llvm.x86.bmi.pdep.32"]
fn x86_bmi2_pdep_32(x: u32, y: u32) -> u32;
#[link_name="llvm.x86.bmi.pdep.64"]
fn x86_bmi2_pdep_64(x: u64, y: u64) -> u64;
#[link_name="llvm.x86.bmi.pext.32"]
fn x86_bmi2_pext_32(x: u32, y: u32) -> u32;
#[link_name="llvm.x86.bmi.pext.64"]
fn x86_bmi2_pext_64(x: u64, y: u64) -> u64;
}
/// Zero higher bits of `a` >= `index`.
#[inline(always)]
#[target_feature = "+bmi2"]
#[cfg_attr(test, assert_instr(bzhi))]
pub fn _bzhi_u32(a: u32, index: u32) -> u32 {
unsafe { x86_bmi2_bzhi_32(a, index) }
}
/// Zero higher bits of `a` >= `index`.
#[inline(always)]
#[target_feature = "+bmi2"]
#[cfg_attr(test, assert_instr(bzhi))]
pub fn _bzhi_u64(a: u64, index: u64) -> u64 {
unsafe { x86_bmi2_bzhi_64(a, index) }
}
/// Scatter contiguous low order bits of `a` to the result at the positions
/// specified by the `mask`.
#[inline(always)]
#[target_feature = "+bmi2"]
#[cfg_attr(test, assert_instr(pdep))]
pub fn _pdep_u32(a: u32, mask: u32) -> u32 {
unsafe { x86_bmi2_pdep_32(a, mask) }
}
/// Scatter contiguous low order bits of `a` to the result at the positions
/// specified by the `mask`.
#[inline(always)]
#[target_feature = "+bmi2"]
#[cfg_attr(test, assert_instr(pdep))]
pub fn _pdep_u64(a: u64, mask: u64) -> u64 {
unsafe { x86_bmi2_pdep_64(a, mask) }
}
/// Gathers the bits of `x` specified by the `mask` into the contiguous low
/// order bit positions of the result.
#[inline(always)]
#[target_feature = "+bmi2"]
#[cfg_attr(test, assert_instr(pext))]
pub fn _pext_u32(a: u32, mask: u32) -> u32 {
unsafe { x86_bmi2_pext_32(a, mask) }
}
/// Gathers the bits of `x` specified by the `mask` into the contiguous low
/// order bit positions of the result.
#[inline(always)]
#[target_feature = "+bmi2"]
#[cfg_attr(test, assert_instr(pext))]
pub fn _pext_u64(a: u64, mask: u64) -> u64 {
unsafe { x86_bmi2_pext_64(a, mask) }
}
// Exercises the BMI2 wrappers against hand-computed bit patterns; only built
// when the host toolchain enables `bmi2` on x86/x86_64.
#[cfg(all(test, target_feature = "bmi2", any(target_arch = "x86", target_arch = "x86_64")))]
mod tests {
use x86::bmi2;
#[test]
#[target_feature = "+bmi2"]
fn _pext_u32() {
// Extracts the bits of `n` selected by each mask, packed toward bit 0.
let n = 0b1011_1110_1001_0011u32;
let m0 = 0b0110_0011_1000_0101u32;
let s0 = 0b0000_0000_0011_0101u32;
let m1 = 0b1110_1011_1110_1111u32;
let s1 = 0b0001_0111_0100_0011u32;
assert_eq!(bmi2::_pext_u32(n, m0), s0);
assert_eq!(bmi2::_pext_u32(n, m1), s1);
}
#[test]
#[target_feature = "+bmi2"]
fn _pext_u64() {
// Same vectors as the u32 case, widened to 64 bits.
let n = 0b1011_1110_1001_0011u64;
let m0 = 0b0110_0011_1000_0101u64;
let s0 = 0b0000_0000_0011_0101u64;
let m1 = 0b1110_1011_1110_1111u64;
let s1 = 0b0001_0111_0100_0011u64;
assert_eq!(bmi2::_pext_u64(n, m0), s0);
assert_eq!(bmi2::_pext_u64(n, m1), s1);
}
#[test]
#[target_feature = "+bmi2"]
fn _pdep_u32() {
// Scatters the low bits of `n` to the set positions of each mask.
let n = 0b1011_1110_1001_0011u32;
let m0 = 0b0110_0011_1000_0101u32;
let s0 = 0b0000_0010_0000_0101u32;
let m1 = 0b1110_1011_1110_1111u32;
let s1 = 0b1110_1001_0010_0011u32;
assert_eq!(bmi2::_pdep_u32(n, m0), s0);
assert_eq!(bmi2::_pdep_u32(n, m1), s1);
}
#[test]
#[target_feature = "+bmi2"]
fn _pdep_u64() {
// Same vectors as the u32 case, widened to 64 bits.
let n = 0b1011_1110_1001_0011u64;
let m0 = 0b0110_0011_1000_0101u64;
let s0 = 0b0000_0010_0000_0101u64;
let m1 = 0b1110_1011_1110_1111u64;
let s1 = 0b1110_1001_0010_0011u64;
assert_eq!(bmi2::_pdep_u64(n, m0), s0);
assert_eq!(bmi2::_pdep_u64(n, m1), s1);
}
#[test]
#[target_feature = "+bmi2"]
fn _bzhi_u32() {
// Zeroes all bits of `n` at index 5 and above.
let n = 0b1111_0010u32;
let s = 0b0001_0010u32;
assert_eq!(bmi2::_bzhi_u32(n, 5), s);
}
#[test]
#[target_feature = "+bmi2"]
fn _bzhi_u64() {
let n = 0b1111_0010u64;
let s = 0b0001_0010u64;
assert_eq!(bmi2::_bzhi_u64(n, 5), s);
}
#[test]
#[target_feature = "+bmi2"]
fn _mulx_u32() {
// Widening multiply: returns the (low, high) halves of the 64-bit product.
let a: u32 = 4_294_967_200;
let b: u32 = 2;
let (lo, hi): (u32, u32) = bmi2::_mulx_u32(a, b);
// result = 8589934400
// = 0b0001_1111_1111_1111_1111_1111_1111_0100_0000u64
//     ^~hi ^~lo~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
assert_eq!(lo, 0b1111_1111_1111_1111_1111_1111_0100_0000u32);
assert_eq!(hi, 0b0001u32);
}
#[test]
#[target_feature = "+bmi2"]
fn _mulx_u64() {
// Widening multiply: returns the (low, high) halves of the 128-bit product.
let a: u64 = 9_223_372_036_854_775_800;
let b: u64 = 100;
let (lo, hi): (u64, u64) = bmi2::_mulx_u64(a, b);
// result = 922337203685477580000
// = 0b00110001_11111111_11111111_11111111_11111111_11111111_11111111_11111100_11100000u128
//     ^~hi~~~~ ^~lo~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
assert_eq!(lo, 0b11111111_11111111_11111111_11111111_11111111_11111111_11111100_11100000u64);
assert_eq!(hi, 0b00110001u64);
}
}

View file

@ -6,6 +6,11 @@ pub use self::sse42::*;
pub use self::avx::*;
pub use self::avx2::*;
pub use self::abm::*;
pub use self::bmi::*;
pub use self::bmi2::*;
pub use self::tbm::*;
#[allow(non_camel_case_types)]
pub type __m128i = ::v128::i8x16;
#[allow(non_camel_case_types)]
@ -20,3 +25,8 @@ mod sse41;
mod sse42;
mod avx;
mod avx2;
mod abm;
mod bmi;
mod bmi2;
mod tbm;

View file

@ -1,9 +1,14 @@
use simd_llvm::simd_shuffle4;
use v128::*;
#[cfg(test)]
use assert_instr::assert_instr;
/// Return the square root of packed single-precision (32-bit) floating-point
/// elements in `a`.
#[inline(always)]
#[target_feature = "+sse"]
#[cfg_attr(test, assert_instr(sqrtps))]
pub fn _mm_sqrt_ps(a: f32x4) -> f32x4 {
// Delegates to the `sqrtps` extern binding; `assert_instr` verifies codegen
// under test builds.
unsafe { sqrtps(a) }
}
@ -12,6 +17,7 @@ pub fn _mm_sqrt_ps(a: f32x4) -> f32x4 {
/// Return the approximate reciprocal of packed single-precision (32-bit)
/// floating-point elements in `a`.
#[inline(always)]
#[target_feature = "+sse"]
#[cfg_attr(test, assert_instr(rcpps))]
pub fn _mm_rcp_ps(a: f32x4) -> f32x4 {
// NOTE(review): `rcpps` yields an approximation, not an exact reciprocal —
// confirm the error bound against the Intel SDM before documenting it.
unsafe { rcpps(a) }
}
@ -20,6 +26,7 @@ pub fn _mm_rcp_ps(a: f32x4) -> f32x4 {
/// Return the approximate reciprocal square root of packed single-precision
/// (32-bit) floating-point elements in `a`.
#[inline(always)]
#[target_feature = "+sse"]
#[cfg_attr(test, assert_instr(rsqrtps))]
pub fn _mm_rsqrt_ps(a: f32x4) -> f32x4 {
// NOTE(review): `rsqrtps` is an approximation — confirm the error bound
// against the Intel SDM before documenting it.
unsafe { rsqrtps(a) }
}
@ -28,6 +35,7 @@ pub fn _mm_rsqrt_ps(a: f32x4) -> f32x4 {
/// Compare packed single-precision (32-bit) floating-point elements in `a` and
/// `b`, and return the corresponding minimum values.
#[inline(always)]
#[target_feature = "+sse"]
#[cfg_attr(test, assert_instr(minps))]
pub fn _mm_min_ps(a: f32x4, b: f32x4) -> f32x4 {
// Delegates to the `minps` extern binding.
unsafe { minps(a, b) }
}
@ -36,16 +44,26 @@ pub fn _mm_min_ps(a: f32x4, b: f32x4) -> f32x4 {
/// Compare packed single-precision (32-bit) floating-point elements in `a` and
/// `b`, and return the corresponding maximum values.
#[inline(always)]
#[target_feature = "+sse"]
#[cfg_attr(test, assert_instr(maxps))]
pub fn _mm_max_ps(a: f32x4, b: f32x4) -> f32x4 {
// Delegates to the `maxps` extern binding.
unsafe { maxps(a, b) }
}
/// Unpack and interleave single-precision (32-bit) floating-point elements
/// from the high half of `a` and `b`.
#[inline(always)]
#[target_feature = "+sse"]
#[cfg_attr(test, assert_instr(unpckhps))]
pub fn _mm_unpackhi_ps(a: f32x4, b: f32x4) -> f32x4 {
    // Shuffle indices 0-3 select lanes of `a`, 4-7 select lanes of `b`, so
    // [2, 6, 3, 7] produces (a2, b2, a3, b3) — the UNPCKHPS lane order.
    unsafe { simd_shuffle4(a, b, [2, 6, 3, 7]) }
}
/// Return a mask of the most significant bit of each element in `a`.
///
/// The mask is stored in the 4 least significant bits of the return value.
/// All other bits are set to `0`.
#[inline(always)]
#[target_feature = "+sse"]
#[cfg_attr(test, assert_instr(movmskps))]
pub fn _mm_movemask_ps(a: f32x4) -> i32 {
// Delegates to the `movmskps` extern binding declared in the extern block
// below.
unsafe { movmskps(a) }
}
@ -66,7 +84,7 @@ extern {
fn movmskps(a: f32x4) -> i32;
}
#[cfg(test)]
#[cfg(all(test, target_feature = "sse", any(target_arch = "x86", target_arch = "x86_64")))]
mod tests {
use v128::*;
use x86::sse;
@ -116,6 +134,15 @@ mod tests {
assert_eq!(r, f32x4::new(-1.0, 20.0, 0.0, -5.0));
}
#[test]
#[target_feature = "+sse"]
fn _mm_unpackhi_ps() {
// High halves interleaved: expect lanes (a2, b2, a3, b3).
let a = f32x4::new(1.0, 2.0, 3.0, 4.0);
let b = f32x4::new(5.0, 6.0, 7.0, 8.0);
let r = sse::_mm_unpackhi_ps(a, b);
assert_eq!(r, f32x4::new(3.0, 7.0, 4.0, 8.0));
}
#[test]
#[target_feature = "+sse"]
fn _mm_movemask_ps() {

View file

@ -9,6 +9,9 @@ use x86::__m128i;
use v128::*;
use v64::*;
#[cfg(test)]
use assert_instr::assert_instr;
/// Provide a hint to the processor that the code sequence is a spin-wait loop.
///
/// This can help improve the performance and power consumption of spin-wait
@ -89,6 +92,7 @@ pub fn _mm_adds_epi8(a: i8x16, b: i8x16) -> i8x16 {
/// Add packed 16-bit integers in `a` and `b` using saturation.
///
/// NOTE(review): saturation presumably clamps each lane to the i16 range
/// (PADDSW semantics) — confirm against the Intel SDM.
#[inline(always)]
#[target_feature = "+sse2"]
#[cfg_attr(test, assert_instr(paddsw))]
pub fn _mm_adds_epi16(a: i16x8, b: i16x8) -> i16x8 {
// Delegates to the `paddsw` extern binding.
unsafe { paddsw(a, b) }
}
@ -1716,7 +1720,7 @@ extern {
fn movmskpd(a: f64x2) -> i32;
}
#[cfg(test)]
#[cfg(all(test, target_feature = "sse2", any(target_arch = "x86", target_arch = "x86_64")))]
mod tests {
use std::os::raw::c_void;

View file

@ -57,7 +57,7 @@ extern {
fn dpps(a: f32x4, b: f32x4, imm8: u8) -> f32x4;
}
#[cfg(test)]
#[cfg(all(test, target_feature = "sse4.1", any(target_arch = "x86", target_arch = "x86_64")))]
mod tests {
use v128::*;
use x86::sse41;

View file

@ -40,7 +40,7 @@ extern {
fn pcmpestri128(a: __m128i, la: i32, b: __m128i, lb: i32, imm8: i8) -> i32;
}
#[cfg(test)]
#[cfg(all(test, target_feature = "sse4.2", any(target_arch = "x86", target_arch = "x86_64")))]
mod tests {
use v128::*;
use x86::{__m128i, sse42};

View file

@ -50,7 +50,7 @@ extern {
fn pshufb128(a: u8x16, b: u8x16) -> u8x16;
}
#[cfg(test)]
#[cfg(all(test, target_feature = "ssse3", any(target_arch = "x86", target_arch = "x86_64")))]
mod tests {
use v128::*;
use x86::ssse3 as ssse3;

View file

@ -0,0 +1,393 @@
//! Trailing Bit Manipulation (TBM) instruction set.
//!
//! The reference is [AMD64 Architecture Programmer's Manual, Volume 3:
//! General-Purpose and System
//! Instructions](http://support.amd.com/TechDocs/24594.pdf).
//!
//! [Wikipedia](https://en.wikipedia.org/wiki/Bit_Manipulation_Instruction_Sets#TBM_.28Trailing_Bit_Manipulation.29)
//! provides a quick overview of the available instructions.
#[cfg(test)]
use assert_instr::assert_instr;
// TODO: LLVM-CODEGEN ERROR: LLVM ERROR: Cannot select: intrinsic %llvm.x86.tbm.bextri.u32
/*
#[allow(dead_code)]
extern "C" {
#[link_name="llvm.x86.tbm.bextri.u32"]
fn x86_tbm_bextri_u32(a: u32, y: u32) -> u32;
#[link_name="llvm.x86.tbm.bextri.u64"]
fn x86_tbm_bextri_u64(x: u64, y: u64) -> u64;
}
/// Extracts bits in range [`start`, `start` + `length`) from `a` into
/// the least significant bits of the result.
#[inline(always)]
#[target_feature = "+tbm"]
pub fn _bextr_u32(a: u32, start: u32, len: u32) -> u32 {
_bextr2_u32(a, (start & 0xffu32) | ((len & 0xffu32) << 8u32))
}
/// Extracts bits in range [`start`, `start` + `length`) from `a` into
/// the least significant bits of the result.
#[inline(always)]
#[target_feature = "+tbm"]
pub fn _bextr_u64(a: u64, start: u64, len: u64) -> u64 {
_bextr2_u64(a, (start & 0xffu64) | ((len & 0xffu64) << 8u64))
}
/// Extracts bits of `a` specified by `control` into
/// the least significant bits of the result.
///
/// Bits [7,0] of `control` specify the index to the first bit in the range to be
/// extracted, and bits [15,8] specify the length of the range.
#[inline(always)]
#[target_feature = "+tbm"]
pub fn _bextr2_u32(a: u32, control: u32) -> u32 {
unsafe { x86_tbm_bextri_u32(a, control) }
}
/// Extracts bits of `a` specified by `control` into
/// the least significant bits of the result.
///
/// Bits [7,0] of `control` specify the index to the first bit in the range to be
/// extracted, and bits [15,8] specify the length of the range.
#[inline(always)]
#[target_feature = "+tbm"]
pub fn _bextr2_u64(a: u64, control: u64) -> u64 {
unsafe { x86_tbm_bextri_u64(a, control) }
}
*/
/// Clears the trailing one-bits of `x`, i.e. every bit below its least
/// significant zero bit.
///
/// When `x` has no zero bit (all ones), the result is zero.
#[inline(always)]
#[target_feature = "+tbm"]
#[cfg_attr(test, assert_instr(blcfill))]
pub fn _blcfill_u32(x: u32) -> u32 {
    // Adding 1 carries through the trailing ones; ANDing drops them.
    let carried = x.wrapping_add(1);
    carried & x
}
/// Clears the trailing one-bits of `x`, i.e. every bit below its least
/// significant zero bit.
///
/// When `x` has no zero bit (all ones), the result is zero.
#[inline(always)]
#[target_feature = "+tbm"]
#[cfg_attr(test, assert_instr(blcfill))]
pub fn _blcfill_u64(x: u64) -> u64 {
    // Adding 1 carries through the trailing ones; ANDing drops them.
    let carried = x.wrapping_add(1);
    carried & x
}
/// Returns a value with every bit set except the least significant zero bit
/// of `x`, which stays clear.
///
/// When `x` has no zero bit, every bit of the result is set.
#[inline(always)]
#[target_feature = "+tbm"]
#[cfg_attr(test, assert_instr(blci))]
pub fn _blci_u32(x: u32) -> u32 {
    // `!(x + 1)` is zero exactly at the lowest clear bit of `x`.
    let succ = x.wrapping_add(1);
    !succ | x
}
/// Returns a value with every bit set except the least significant zero bit
/// of `x`, which stays clear.
///
/// When `x` has no zero bit, every bit of the result is set.
#[inline(always)]
#[target_feature = "+tbm"]
#[cfg_attr(test, assert_instr(blci))]
pub fn _blci_u64(x: u64) -> u64 {
    // `!(x + 1)` is zero exactly at the lowest clear bit of `x`.
    let succ = x.wrapping_add(1);
    !succ | x
}
/// Isolates the least significant zero bit of `x`: that position is set in
/// the result and every other bit is clear.
///
/// When `x` has no zero bit, the result is zero.
#[inline(always)]
#[target_feature = "+tbm"]
#[cfg_attr(test, assert_instr(blcic))]
pub fn _blcic_u32(x: u32) -> u32 {
    // `x + 1` sets the lowest clear bit; `!x` keeps only positions clear in `x`.
    let succ = x.wrapping_add(1);
    succ & !x
}
/// Isolates the least significant zero bit of `x`: that position is set in
/// the result and every other bit is clear.
///
/// When `x` has no zero bit, the result is zero.
#[inline(always)]
#[target_feature = "+tbm"]
#[cfg_attr(test, assert_instr(blcic))]
pub fn _blcic_u64(x: u64) -> u64 {
    // `x + 1` sets the lowest clear bit; `!x` keeps only positions clear in `x`.
    let succ = x.wrapping_add(1);
    succ & !x
}
/// Builds a mask of contiguous ones from bit 0 up to and including the least
/// significant zero bit of `x`; all higher bits are cleared.
///
/// When `x` has no zero bit, every bit of the result is set.
#[inline(always)]
#[target_feature = "+tbm"]
#[cfg_attr(test, assert_instr(blcmsk))]
pub fn _blcmsk_u32(x: u32) -> u32 {
    // XOR with `x + 1` leaves exactly the bits flipped by the carry chain.
    let succ = x.wrapping_add(1);
    succ ^ x
}
/// Builds a mask of contiguous ones from bit 0 up to and including the least
/// significant zero bit of `x`; all higher bits are cleared.
///
/// When `x` has no zero bit, every bit of the result is set.
#[inline(always)]
#[target_feature = "+tbm"]
#[cfg_attr(test, assert_instr(blcmsk))]
pub fn _blcmsk_u64(x: u64) -> u64 {
    // XOR with `x + 1` leaves exactly the bits flipped by the carry chain.
    let succ = x.wrapping_add(1);
    succ ^ x
}
/// Returns `x` with its least significant zero bit turned on.
///
/// When `x` has no zero bit, `x` is returned unchanged.
#[inline(always)]
#[target_feature = "+tbm"]
#[cfg_attr(test, assert_instr(blcs))]
pub fn _blcs_u32(x: u32) -> u32 {
    // `x + 1` flips the lowest clear bit on; OR merges it back into `x`.
    let succ = x.wrapping_add(1);
    succ | x
}
/// Returns `x` with its least significant zero bit turned on.
///
/// When `x` has no zero bit, `x` is returned unchanged.
#[inline(always)]
#[target_feature = "+tbm"]
#[cfg_attr(test, assert_instr(blcs))]
pub fn _blcs_u64(x: u64) -> u64 {
    // `x + 1` flips the lowest clear bit on; OR merges it back into `x`.
    let succ = x.wrapping_add(1);
    succ | x
}
/// Returns `x` with every bit below its least significant set bit also set.
///
/// When `x` is zero (no set bit), every bit of the result is set.
#[inline(always)]
#[target_feature = "+tbm"]
#[cfg_attr(test, assert_instr(blsfill))]
pub fn _blsfill_u32(x: u32) -> u32 {
    // `x - 1` turns the trailing zeros into ones; OR fills them in.
    let pred = x.wrapping_sub(1);
    pred | x
}
/// Returns `x` with every bit below its least significant set bit also set.
///
/// When `x` is zero (no set bit), every bit of the result is set.
#[inline(always)]
#[target_feature = "+tbm"]
#[cfg_attr(test, assert_instr(blsfill))]
pub fn _blsfill_u64(x: u64) -> u64 {
    // `x - 1` turns the trailing zeros into ones; OR fills them in.
    let pred = x.wrapping_sub(1);
    pred | x
}
/// Clears the least significant set bit of `x` and sets every other bit of
/// the result.
///
/// When `x` is zero (no set bit), every bit of the result is set.
#[inline(always)]
#[target_feature = "+tbm"]
#[cfg_attr(test, assert_instr(blsic))]
pub fn _blsic_u32(x: u32) -> u32 {
    // `x - 1` is zero only at the lowest set bit; OR with `!x` fills the rest.
    let pred = x.wrapping_sub(1);
    pred | !x
}
/// Clears the least significant set bit of `x` and sets every other bit of
/// the result.
///
/// When `x` is zero (no set bit), every bit of the result is set.
#[inline(always)]
#[target_feature = "+tbm"]
#[cfg_attr(test, assert_instr(blsic))]
pub fn _blsic_u64(x: u64) -> u64 {
    // `x - 1` is zero only at the lowest set bit; OR with `!x` fills the rest.
    let pred = x.wrapping_sub(1);
    pred | !x
}
/// Clears the trailing one-bits of `x` (everything below its least
/// significant zero) and sets every other bit of the result.
///
/// When bit 0 of `x` is already clear, every bit of the result is set.
#[inline(always)]
#[target_feature = "+tbm"]
#[cfg_attr(test, assert_instr(t1mskc))]
pub fn _t1mskc_u32(x: u32) -> u32 {
    // The carry in `x + 1` zeroes exactly the trailing-ones positions of `!x`.
    let succ = x.wrapping_add(1);
    succ | !x
}
/// Clears the trailing one-bits of `x` (everything below its least
/// significant zero) and sets every other bit of the result.
///
/// When bit 0 of `x` is already clear, every bit of the result is set.
#[inline(always)]
#[target_feature = "+tbm"]
#[cfg_attr(test, assert_instr(t1mskc))]
pub fn _t1mskc_u64(x: u64) -> u64 {
    // The carry in `x + 1` zeroes exactly the trailing-ones positions of `!x`.
    let succ = x.wrapping_add(1);
    succ | !x
}
/// Builds a mask of the trailing zeros of `x`: every bit below its least
/// significant set bit is set, all other bits are clear.
///
/// When bit 0 of `x` is set, the result is zero.
#[inline(always)]
#[target_feature = "+tbm"]
#[cfg_attr(test, assert_instr(tzmsk))]
pub fn _tzmsk_u32(x: u32) -> u32 {
    // `x - 1` sets the trailing-zero positions; `!x` restricts to clear bits.
    let pred = x.wrapping_sub(1);
    pred & !x
}
/// Builds a mask of the trailing zeros of `x`: every bit below its least
/// significant set bit is set, all other bits are clear.
///
/// When bit 0 of `x` is set, the result is zero.
#[inline(always)]
#[target_feature = "+tbm"]
#[cfg_attr(test, assert_instr(tzmsk))]
pub fn _tzmsk_u64(x: u64) -> u64 {
    // `x - 1` sets the trailing-zero positions; `!x` restricts to clear bits.
    let pred = x.wrapping_sub(1);
    pred & !x
}
// Exercises each safe TBM helper against hand-computed bit patterns; only
// built when the host toolchain enables `tbm` on x86/x86_64.
#[cfg(all(test, target_feature = "tbm", any(target_arch = "x86", target_arch = "x86_64")))]
mod tests {
use x86::tbm;
/*
#[test]
#[target_feature = "+tbm"]
fn _bextr_u32() {
assert_eq!(tbm::_bextr_u32(0b0101_0000u32, 4, 4), 0b0000_0101u32);
}
#[test]
#[target_feature = "+tbm"]
fn _bextr_u64() {
assert_eq!(tbm::_bextr_u64(0b0101_0000u64, 4, 4), 0b0000_0101u64);
}
*/
#[test]
#[target_feature = "+tbm"]
fn _blcfill_u32() {
assert_eq!(tbm::_blcfill_u32(0b0101_0111u32), 0b0101_0000u32);
assert_eq!(tbm::_blcfill_u32(0b1111_1111u32), 0u32);
}
#[test]
#[target_feature = "+tbm"]
fn _blcfill_u64() {
assert_eq!(tbm::_blcfill_u64(0b0101_0111u64), 0b0101_0000u64);
assert_eq!(tbm::_blcfill_u64(0b1111_1111u64), 0u64);
}
#[test]
#[target_feature = "+tbm"]
fn _blci_u32() {
assert_eq!(tbm::_blci_u32(0b0101_0000u32),
0b1111_1111_1111_1111_1111_1111_1111_1110u32);
assert_eq!(tbm::_blci_u32(0b1111_1111u32),
0b1111_1111_1111_1111_1111_1110_1111_1111u32);
}
#[test]
#[target_feature = "+tbm"]
fn _blci_u64() {
assert_eq!(tbm::_blci_u64(0b0101_0000u64),
0b1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1110u64);
assert_eq!(tbm::_blci_u64(0b1111_1111u64),
0b1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1110_1111_1111u64);
}
#[test]
#[target_feature = "+tbm"]
fn _blcic_u32() {
assert_eq!(tbm::_blcic_u32(0b0101_0001u32), 0b0000_0010u32);
assert_eq!(tbm::_blcic_u32(0b1111_1111u32), 0b1_0000_0000u32);
}
#[test]
#[target_feature = "+tbm"]
fn _blcic_u64() {
assert_eq!(tbm::_blcic_u64(0b0101_0001u64), 0b0000_0010u64);
assert_eq!(tbm::_blcic_u64(0b1111_1111u64), 0b1_0000_0000u64);
}
#[test]
#[target_feature = "+tbm"]
fn _blcmsk_u32() {
assert_eq!(tbm::_blcmsk_u32(0b0101_0001u32), 0b0000_0011u32);
assert_eq!(tbm::_blcmsk_u32(0b1111_1111u32), 0b1_1111_1111u32);
}
#[test]
#[target_feature = "+tbm"]
fn _blcmsk_u64() {
assert_eq!(tbm::_blcmsk_u64(0b0101_0001u64), 0b0000_0011u64);
assert_eq!(tbm::_blcmsk_u64(0b1111_1111u64), 0b1_1111_1111u64);
}
#[test]
#[target_feature = "+tbm"]
fn _blcs_u32() {
assert_eq!(tbm::_blcs_u32(0b0101_0001u32), 0b0101_0011u32);
assert_eq!(tbm::_blcs_u32(0b1111_1111u32), 0b1_1111_1111u32);
}
#[test]
#[target_feature = "+tbm"]
fn _blcs_u64() {
assert_eq!(tbm::_blcs_u64(0b0101_0001u64), 0b0101_0011u64);
assert_eq!(tbm::_blcs_u64(0b1111_1111u64), 0b1_1111_1111u64);
}
#[test]
#[target_feature = "+tbm"]
fn _blsfill_u32() {
assert_eq!(tbm::_blsfill_u32(0b0101_0100u32), 0b0101_0111u32);
assert_eq!(tbm::_blsfill_u32(0u32), 0b1111_1111_1111_1111_1111_1111_1111_1111u32);
}
#[test]
#[target_feature = "+tbm"]
fn _blsfill_u64() {
assert_eq!(tbm::_blsfill_u64(0b0101_0100u64), 0b0101_0111u64);
assert_eq!(tbm::_blsfill_u64(0u64), 0b1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111u64);
}
#[test]
#[target_feature = "+tbm"]
fn _blsic_u32() {
assert_eq!(tbm::_blsic_u32(0b0101_0100u32), 0b1111_1111_1111_1111_1111_1111_1111_1011u32);
assert_eq!(tbm::_blsic_u32(0u32), 0b1111_1111_1111_1111_1111_1111_1111_1111u32);
}
#[test]
#[target_feature = "+tbm"]
fn _blsic_u64() {
assert_eq!(tbm::_blsic_u64(0b0101_0100u64), 0b1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1011u64);
assert_eq!(tbm::_blsic_u64(0u64), 0b1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111u64);
}
#[test]
#[target_feature = "+tbm"]
fn _t1mskc_u32() {
assert_eq!(tbm::_t1mskc_u32(0b0101_0111u32), 0b1111_1111_1111_1111_1111_1111_1111_1000u32);
assert_eq!(tbm::_t1mskc_u32(0u32), 0b1111_1111_1111_1111_1111_1111_1111_1111u32);
}
#[test]
#[target_feature = "+tbm"]
// Renamed from `_t1mksc_u64` — the original test name transposed the letters
// of the `t1mskc` mnemonic it exercises.
fn _t1mskc_u64() {
assert_eq!(tbm::_t1mskc_u64(0b0101_0111u64), 0b1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1000u64);
assert_eq!(tbm::_t1mskc_u64(0u64), 0b1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111u64);
}
#[test]
#[target_feature = "+tbm"]
fn _tzmsk_u32() {
assert_eq!(tbm::_tzmsk_u32(0b0101_1000u32), 0b0000_0111u32);
assert_eq!(tbm::_tzmsk_u32(0b0101_1001u32), 0b0000_0000u32);
}
#[test]
#[target_feature = "+tbm"]
fn _tzmsk_u64() {
assert_eq!(tbm::_tzmsk_u64(0b0101_1000u64), 0b0000_0111u64);
assert_eq!(tbm::_tzmsk_u64(0b0101_1001u64), 0b0000_0000u64);
}
}