From 9b03920fe17ddb1e61a179a7a83fcb00ce740e98 Mon Sep 17 00:00:00 2001 From: Alex Crichton Date: Thu, 11 Oct 2018 14:43:19 -0700 Subject: [PATCH 1/2] Optimize intrinsics on wasm32 Profiling a recent demo I was playing with on `wasm32-unknown-unknown` pointed me to the surprising result that 15% of the execution time was in the `sqrt` intrinsic (there's a lot of math here). Upon investigation I remembered that wasm (unconditionally) has a native `f32.sqrt` instruction! I was then subsequently confused that a simple `f.sqrt()` actually codegens to use `f32.sqrt` in Rust, but I later realized that the implementations of intrinsics in this library often use other intrinsics to implement them. That means that the real intrinsic here, `acos`, internally called `sqrt` at some point but wasn't using the optimized implementation! To help fix this situation this PR is intended on providing the infrastructure for optimized implementations (via code generation) to be used for each intrinsic. I've gone through the various math instructions that wasm has available and updated each of the intrinsic implementations in this crate to optionally use the LLVM intrinsic versions, which are known to unconditionally compile down to a single instruction (unlike the arbitrary platform, where we don't know what it will compile down to!). To do this I created a new macro to wrap the invocation of LLVM intrinsics. Invoking LLVM intrinsics is turned off by default (through a new and on-by-default feature, `stable`). When the `stable` feature is disabled, however, then the wasm-target specifically will enable usage of the LLVM intrinsics. I've additionally added a CI builder which should verify that these continue to build on Travis. 
After this I intended to update the submodule in the `compiler-builtins` repository so we can pull in the optimized implementation there, and `compiler-builtins` naturally won't set `feature = "stable"` when compiling so all the intrinsics should get compiled in by default. After a further update of `the libcompiler_builtins` submodule in rust-lang/rust we should be good to go! --- library/compiler-builtins/libm/.travis.yml | 7 +++++++ library/compiler-builtins/libm/Cargo.toml | 4 ++++ library/compiler-builtins/libm/src/lib.rs | 4 ++++ library/compiler-builtins/libm/src/math/ceil.rs | 8 ++++++++ library/compiler-builtins/libm/src/math/ceilf.rs | 8 ++++++++ library/compiler-builtins/libm/src/math/fabs.rs | 8 ++++++++ library/compiler-builtins/libm/src/math/fabsf.rs | 8 ++++++++ library/compiler-builtins/libm/src/math/floor.rs | 8 ++++++++ library/compiler-builtins/libm/src/math/floorf.rs | 8 ++++++++ library/compiler-builtins/libm/src/math/mod.rs | 11 +++++++++++ library/compiler-builtins/libm/src/math/sqrt.rs | 12 ++++++++++++ library/compiler-builtins/libm/src/math/sqrtf.rs | 12 ++++++++++++ library/compiler-builtins/libm/src/math/trunc.rs | 8 ++++++++ library/compiler-builtins/libm/src/math/truncf.rs | 8 ++++++++ 14 files changed, 114 insertions(+) diff --git a/library/compiler-builtins/libm/.travis.yml b/library/compiler-builtins/libm/.travis.yml index 47f2b2f205df..758316178196 100644 --- a/library/compiler-builtins/libm/.travis.yml +++ b/library/compiler-builtins/libm/.travis.yml @@ -29,6 +29,13 @@ matrix: - env: TARGET=cargo-fmt rust: beta + - env: TARGET=wasm32-unknown-unknown + rust: nightly + install: rustup target add $TARGET + script: + - cargo build --target $TARGET + - cargo build --no-default-features --target $TARGET + before_install: set -e install: diff --git a/library/compiler-builtins/libm/Cargo.toml b/library/compiler-builtins/libm/Cargo.toml index cedf8d267db8..f7a528334241 100644 --- a/library/compiler-builtins/libm/Cargo.toml +++ 
b/library/compiler-builtins/libm/Cargo.toml @@ -24,3 +24,7 @@ members = [ [dev-dependencies] shared = { path = "shared" } + +[features] +default = ['stable'] +stable = [] diff --git a/library/compiler-builtins/libm/src/lib.rs b/library/compiler-builtins/libm/src/lib.rs index 627c6443e3dc..6be458728197 100644 --- a/library/compiler-builtins/libm/src/lib.rs +++ b/library/compiler-builtins/libm/src/lib.rs @@ -11,6 +11,10 @@ #![deny(warnings)] #![no_std] +#![cfg_attr( + all(target_arch = "wasm32", not(feature = "stable")), + feature(core_intrinsics) +)] mod math; diff --git a/library/compiler-builtins/libm/src/math/ceil.rs b/library/compiler-builtins/libm/src/math/ceil.rs index 4db2ca840368..5dbfa6a2c0ad 100644 --- a/library/compiler-builtins/libm/src/math/ceil.rs +++ b/library/compiler-builtins/libm/src/math/ceil.rs @@ -4,6 +4,14 @@ const TOINT: f64 = 1. / f64::EPSILON; #[inline] pub fn ceil(x: f64) -> f64 { + // On wasm32 we know that LLVM's intrinsic will compile to an optimized + // `f64.ceil` native instruction, so we can leverage this for both code size + // and speed. + llvm_intrinsically_optimized! { + #[cfg(target_arch = "wasm32")] { + return unsafe { ::core::intrinsics::ceilf64(x) } + } + } let u: u64 = x.to_bits(); let e: i64 = (u >> 52 & 0x7ff) as i64; let y: f64; diff --git a/library/compiler-builtins/libm/src/math/ceilf.rs b/library/compiler-builtins/libm/src/math/ceilf.rs index 16bffb3002e5..c8cd4b5aa5b5 100644 --- a/library/compiler-builtins/libm/src/math/ceilf.rs +++ b/library/compiler-builtins/libm/src/math/ceilf.rs @@ -2,6 +2,14 @@ use core::f32; #[inline] pub fn ceilf(x: f32) -> f32 { + // On wasm32 we know that LLVM's intrinsic will compile to an optimized + // `f32.ceil` native instruction, so we can leverage this for both code size + // and speed. + llvm_intrinsically_optimized! 
{ + #[cfg(target_arch = "wasm32")] { + return unsafe { ::core::intrinsics::ceilf32(x) } + } + } let mut ui = x.to_bits(); let e = (((ui >> 23) & 0xff) - 0x7f) as i32; diff --git a/library/compiler-builtins/libm/src/math/fabs.rs b/library/compiler-builtins/libm/src/math/fabs.rs index 9e081f3f9f69..7c804653c996 100644 --- a/library/compiler-builtins/libm/src/math/fabs.rs +++ b/library/compiler-builtins/libm/src/math/fabs.rs @@ -2,5 +2,13 @@ use core::u64; #[inline] pub fn fabs(x: f64) -> f64 { + // On wasm32 we know that LLVM's intrinsic will compile to an optimized + // `f64.abs` native instruction, so we can leverage this for both code size + // and speed. + llvm_intrinsically_optimized! { + #[cfg(target_arch = "wasm32")] { + return unsafe { ::core::intrinsics::fabsf64(x) } + } + } f64::from_bits(x.to_bits() & (u64::MAX / 2)) } diff --git a/library/compiler-builtins/libm/src/math/fabsf.rs b/library/compiler-builtins/libm/src/math/fabsf.rs index 4cc9411169ab..884c20f6c410 100644 --- a/library/compiler-builtins/libm/src/math/fabsf.rs +++ b/library/compiler-builtins/libm/src/math/fabsf.rs @@ -1,4 +1,12 @@ #[inline] pub fn fabsf(x: f32) -> f32 { + // On wasm32 we know that LLVM's intrinsic will compile to an optimized + // `f32.abs` native instruction, so we can leverage this for both code size + // and speed. + llvm_intrinsically_optimized! { + #[cfg(target_arch = "wasm32")] { + return unsafe { ::core::intrinsics::fabsf32(x) } + } + } f32::from_bits(x.to_bits() & 0x7fffffff) } diff --git a/library/compiler-builtins/libm/src/math/floor.rs b/library/compiler-builtins/libm/src/math/floor.rs index 997865d39e93..b14a48d55bc7 100644 --- a/library/compiler-builtins/libm/src/math/floor.rs +++ b/library/compiler-builtins/libm/src/math/floor.rs @@ -4,6 +4,14 @@ const TOINT: f64 = 1. 
/ f64::EPSILON; #[inline] pub fn floor(x: f64) -> f64 { + // On wasm32 we know that LLVM's intrinsic will compile to an optimized + // `f64.floor` native instruction, so we can leverage this for both code size + // and speed. + llvm_intrinsically_optimized! { + #[cfg(target_arch = "wasm32")] { + return unsafe { ::core::intrinsics::floorf64(x) } + } + } let ui = x.to_bits(); let e = ((ui >> 52) & 0x7ff) as i32; diff --git a/library/compiler-builtins/libm/src/math/floorf.rs b/library/compiler-builtins/libm/src/math/floorf.rs index 9c263b51828b..71b5953df3e2 100644 --- a/library/compiler-builtins/libm/src/math/floorf.rs +++ b/library/compiler-builtins/libm/src/math/floorf.rs @@ -2,6 +2,14 @@ use core::f32; #[inline] pub fn floorf(x: f32) -> f32 { + // On wasm32 we know that LLVM's intrinsic will compile to an optimized + // `f32.floor` native instruction, so we can leverage this for both code size + // and speed. + llvm_intrinsically_optimized! { + #[cfg(target_arch = "wasm32")] { + return unsafe { ::core::intrinsics::floorf32(x) } + } + } let mut ui = x.to_bits(); let e = (((ui >> 23) & 0xff) - 0x7f) as i32; diff --git a/library/compiler-builtins/libm/src/math/mod.rs b/library/compiler-builtins/libm/src/math/mod.rs index da34fb4cecd1..e51b1511dfcb 100644 --- a/library/compiler-builtins/libm/src/math/mod.rs +++ b/library/compiler-builtins/libm/src/math/mod.rs @@ -58,6 +58,17 @@ macro_rules! i { }; } +macro_rules! 
llvm_intrinsically_optimized { + (#[cfg($($clause:tt)*)] $e:expr) => { + #[cfg(all(not(feature = "stable"), $($clause)*))] + { + if true { // thwart the dead code lint + $e + } + } + }; +} + // Public modules mod acos; mod acosf; diff --git a/library/compiler-builtins/libm/src/math/sqrt.rs b/library/compiler-builtins/libm/src/math/sqrt.rs index cbadb49bba03..b2387a26e750 100644 --- a/library/compiler-builtins/libm/src/math/sqrt.rs +++ b/library/compiler-builtins/libm/src/math/sqrt.rs @@ -82,6 +82,18 @@ const TINY: f64 = 1.0e-300; #[inline] pub fn sqrt(x: f64) -> f64 { + // On wasm32 we know that LLVM's intrinsic will compile to an optimized + // `f64.sqrt` native instruction, so we can leverage this for both code size + // and speed. + llvm_intrinsically_optimized! { + #[cfg(target_arch = "wasm32")] { + return if x < 0.0 { + f64::NAN + } else { + unsafe { ::core::intrinsics::sqrtf64(x) } + } + } + } let mut z: f64; let sign: u32 = 0x80000000; let mut ix0: i32; diff --git a/library/compiler-builtins/libm/src/math/sqrtf.rs b/library/compiler-builtins/libm/src/math/sqrtf.rs index 49984689efc2..33cafbcbda36 100644 --- a/library/compiler-builtins/libm/src/math/sqrtf.rs +++ b/library/compiler-builtins/libm/src/math/sqrtf.rs @@ -17,6 +17,18 @@ const TINY: f32 = 1.0e-30; #[inline] pub fn sqrtf(x: f32) -> f32 { + // On wasm32 we know that LLVM's intrinsic will compile to an optimized + // `f32.sqrt` native instruction, so we can leverage this for both code size + // and speed. + llvm_intrinsically_optimized! 
{ + #[cfg(target_arch = "wasm32")] { + return if x < 0.0 { + ::core::f32::NAN + } else { + unsafe { ::core::intrinsics::sqrtf32(x) } + } + } + } let mut z: f32; let sign: i32 = 0x80000000u32 as i32; let mut ix: i32; diff --git a/library/compiler-builtins/libm/src/math/trunc.rs b/library/compiler-builtins/libm/src/math/trunc.rs index 6bea67cbc165..8eecfcf538e5 100644 --- a/library/compiler-builtins/libm/src/math/trunc.rs +++ b/library/compiler-builtins/libm/src/math/trunc.rs @@ -2,6 +2,14 @@ use core::f64; #[inline] pub fn trunc(x: f64) -> f64 { + // On wasm32 we know that LLVM's intrinsic will compile to an optimized + // `f64.trunc` native instruction, so we can leverage this for both code size + // and speed. + llvm_intrinsically_optimized! { + #[cfg(target_arch = "wasm32")] { + return unsafe { ::core::intrinsics::truncf64(x) } + } + } let x1p120 = f64::from_bits(0x4770000000000000); // 0x1p120f === 2 ^ 120 let mut i: u64 = x.to_bits(); diff --git a/library/compiler-builtins/libm/src/math/truncf.rs b/library/compiler-builtins/libm/src/math/truncf.rs index 9d42620d9666..0d74fea9c9ee 100644 --- a/library/compiler-builtins/libm/src/math/truncf.rs +++ b/library/compiler-builtins/libm/src/math/truncf.rs @@ -2,6 +2,14 @@ use core::f32; #[inline] pub fn truncf(x: f32) -> f32 { + // On wasm32 we know that LLVM's intrinsic will compile to an optimized + // `f32.trunc` native instruction, so we can leverage this for both code size + // and speed. + llvm_intrinsically_optimized! 
{ + #[cfg(target_arch = "wasm32")] { + return unsafe { ::core::intrinsics::truncf32(x) } + } + } let x1p120 = f32::from_bits(0x7b800000); // 0x1p120f === 2 ^ 120 let mut i: u32 = x.to_bits(); From 5c6fd41b3deab08ab7cbbfec15bc8ce5af593a1c Mon Sep 17 00:00:00 2001 From: Jorge Aparicio Date: Wed, 24 Oct 2018 00:18:30 +0200 Subject: [PATCH 2/2] merge [features] tables --- library/compiler-builtins/libm/Cargo.toml | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/library/compiler-builtins/libm/Cargo.toml b/library/compiler-builtins/libm/Cargo.toml index f7a528334241..f28024d041c5 100644 --- a/library/compiler-builtins/libm/Cargo.toml +++ b/library/compiler-builtins/libm/Cargo.toml @@ -12,6 +12,8 @@ version = "0.1.2" [features] # only used to run our test suite checked = [] +default = ['stable'] +stable = [] [workspace] members = [ @@ -24,7 +26,3 @@ members = [ [dev-dependencies] shared = { path = "shared" } - -[features] -default = ['stable'] -stable = []