From 1b7346bf5fb63c4c74ad492e7a566b01feec6f56 Mon Sep 17 00:00:00 2001 From: Trevor Gross Date: Mon, 28 Oct 2024 19:30:04 -0500 Subject: [PATCH 1/3] Introduce `math::arch::intrinsics` This module provides implementations of basic functions that defer to LLVM for what to do, rather than either using a builtin operation or calling another function in this library. `math::arch` will become the home of anything architecture-specific in the future. --- .../libm/src/math/arch/intrinsics.rs | 52 +++++++++++++++++++ .../libm/src/math/arch/mod.rs | 9 ++++ .../compiler-builtins/libm/src/math/mod.rs | 1 + 3 files changed, 62 insertions(+) create mode 100644 library/compiler-builtins/libm/src/math/arch/intrinsics.rs create mode 100644 library/compiler-builtins/libm/src/math/arch/mod.rs diff --git a/library/compiler-builtins/libm/src/math/arch/intrinsics.rs b/library/compiler-builtins/libm/src/math/arch/intrinsics.rs new file mode 100644 index 000000000000..1cf9291f4c75 --- /dev/null +++ b/library/compiler-builtins/libm/src/math/arch/intrinsics.rs @@ -0,0 +1,52 @@ +// Config is needed for times when this module is available but we don't call everything +#![allow(dead_code)] + +pub fn ceil(x: f64) -> f64 { + // SAFETY: safe intrinsic with no preconditions + unsafe { core::intrinsics::ceilf64(x) } +} + +pub fn ceilf(x: f32) -> f32 { + // SAFETY: safe intrinsic with no preconditions + unsafe { core::intrinsics::ceilf32(x) } +} + +pub fn fabs(x: f64) -> f64 { + // SAFETY: safe intrinsic with no preconditions + unsafe { core::intrinsics::fabsf64(x) } +} + +pub fn fabsf(x: f32) -> f32 { + // SAFETY: safe intrinsic with no preconditions + unsafe { core::intrinsics::fabsf32(x) } +} + +pub fn floor(x: f64) -> f64 { + // SAFETY: safe intrinsic with no preconditions + unsafe { core::intrinsics::floorf64(x) } +} + +pub fn floorf(x: f32) -> f32 { + // SAFETY: safe intrinsic with no preconditions + unsafe { core::intrinsics::floorf32(x) } +} + +pub fn sqrt(x: f64) -> f64 { + // SAFETY: safe 
intrinsic with no preconditions + unsafe { core::intrinsics::sqrtf64(x) } +} + +pub fn sqrtf(x: f32) -> f32 { + // SAFETY: safe intrinsic with no preconditions + unsafe { core::intrinsics::sqrtf32(x) } +} + +pub fn trunc(x: f64) -> f64 { + // SAFETY: safe intrinsic with no preconditions + unsafe { core::intrinsics::truncf64(x) } +} + +pub fn truncf(x: f32) -> f32 { + // SAFETY: safe intrinsic with no preconditions + unsafe { core::intrinsics::truncf32(x) } +} diff --git a/library/compiler-builtins/libm/src/math/arch/mod.rs b/library/compiler-builtins/libm/src/math/arch/mod.rs new file mode 100644 index 000000000000..a4bc218b743d --- /dev/null +++ b/library/compiler-builtins/libm/src/math/arch/mod.rs @@ -0,0 +1,9 @@ +//! Architecture-specific routines and operations. +//! +//! LLVM will already optimize calls to some of these in cases that there are hardware +//! instructions. Providing an implementation here just ensures that the faster implementation +//! is used when calling the function directly. This helps anyone who uses `libm` directly, as +//! well as improving things when these routines are called as part of other implementations. + +#[cfg(intrinsics_enabled)] +pub mod intrinsics; diff --git a/library/compiler-builtins/libm/src/math/mod.rs b/library/compiler-builtins/libm/src/math/mod.rs index 9baa57fc8825..e3e6846d377f 100644 --- a/library/compiler-builtins/libm/src/math/mod.rs +++ b/library/compiler-builtins/libm/src/math/mod.rs @@ -302,6 +302,7 @@ pub use self::trunc::trunc; pub use self::truncf::truncf; // Private modules +mod arch; mod expo2; mod fenv; mod k_cos; From d54896343cab56e2f4c9866e54e7954c9c70d753 Mon Sep 17 00:00:00 2001 From: Trevor Gross Date: Sat, 26 Oct 2024 02:56:22 -0500 Subject: [PATCH 2/3] Introduce a `select_implementation` macro Currently there is a macro called `llvm_intrinsically_optimized` that uses an intrinsic rather than the function implementation if the configuration is correct. 
Add a new macro `select_implementation` that is somewhat cleaner to use. In the future, we can update this macro with more fields to specify other implementations that may be selected, such as something architecture-specific or e.g. using a generic implementation for `f32` routines, rather than those that convert to `f64`. This introduces a `macros` module within `math/support`. We will be able to move more things here later. --- .../compiler-builtins/libm/src/math/mod.rs | 63 ++++++++++--------- .../libm/src/math/support/macros.rs | 34 ++++++++++ .../libm/src/math/support/mod.rs | 2 + 3 files changed, 69 insertions(+), 30 deletions(-) create mode 100644 library/compiler-builtins/libm/src/math/support/macros.rs create mode 100644 library/compiler-builtins/libm/src/math/support/mod.rs diff --git a/library/compiler-builtins/libm/src/math/mod.rs b/library/compiler-builtins/libm/src/math/mod.rs index e3e6846d377f..a7e16bfc865c 100644 --- a/library/compiler-builtins/libm/src/math/mod.rs +++ b/library/compiler-builtins/libm/src/math/mod.rs @@ -74,6 +74,7 @@ macro_rules! div { }; } +// FIXME: phase this out, to be replaced by the more flexible `select_implementation` macro_rules! llvm_intrinsically_optimized { (#[cfg($($clause:tt)*)] $e:expr) => { #[cfg(all(intrinsics_enabled, not(feature = "force-soft-floats"), $($clause)*))] @@ -85,6 +86,38 @@ macro_rules! 
llvm_intrinsically_optimized { }; } +// Private modules +#[macro_use] +mod support; +mod arch; +mod expo2; +mod fenv; +mod k_cos; +mod k_cosf; +mod k_expo2; +mod k_expo2f; +mod k_sin; +mod k_sinf; +mod k_tan; +mod k_tanf; +mod rem_pio2; +mod rem_pio2_large; +mod rem_pio2f; + +// Private re-imports +use self::expo2::expo2; +use self::k_cos::k_cos; +use self::k_cosf::k_cosf; +use self::k_expo2::k_expo2; +use self::k_expo2f::k_expo2f; +use self::k_sin::k_sin; +use self::k_sinf::k_sinf; +use self::k_tan::k_tan; +use self::k_tanf::k_tanf; +use self::rem_pio2::rem_pio2; +use self::rem_pio2_large::rem_pio2_large; +use self::rem_pio2f::rem_pio2f; + // Public modules mod acos; mod acosf; @@ -301,36 +334,6 @@ pub use self::tgammaf::tgammaf; pub use self::trunc::trunc; pub use self::truncf::truncf; -// Private modules -mod arch; -mod expo2; -mod fenv; -mod k_cos; -mod k_cosf; -mod k_expo2; -mod k_expo2f; -mod k_sin; -mod k_sinf; -mod k_tan; -mod k_tanf; -mod rem_pio2; -mod rem_pio2_large; -mod rem_pio2f; - -// Private re-imports -use self::expo2::expo2; -use self::k_cos::k_cos; -use self::k_cosf::k_cosf; -use self::k_expo2::k_expo2; -use self::k_expo2f::k_expo2f; -use self::k_sin::k_sin; -use self::k_sinf::k_sinf; -use self::k_tan::k_tan; -use self::k_tanf::k_tanf; -use self::rem_pio2::rem_pio2; -use self::rem_pio2_large::rem_pio2_large; -use self::rem_pio2f::rem_pio2f; - #[inline] fn get_high_word(x: f64) -> u32 { (x.to_bits() >> 32) as u32 diff --git a/library/compiler-builtins/libm/src/math/support/macros.rs b/library/compiler-builtins/libm/src/math/support/macros.rs new file mode 100644 index 000000000000..6bc75837a349 --- /dev/null +++ b/library/compiler-builtins/libm/src/math/support/macros.rs @@ -0,0 +1,34 @@ +/// Choose among using an intrinsic (if available) and falling back to the default function body. +/// Returns directly if the intrinsic version is used, otherwise continues to the rest of the +/// function. 
+/// +/// Use this if the intrinsic is likely to be more performant on the platform(s) specified +/// in `intrinsic_available`. +/// +/// The `cfg` used here is controlled by `build.rs` so the passed meta does not need to account +/// for e.g. the `unstable-intrinsics` or `force-soft-float` features. +macro_rules! select_implementation { + ( + name: $fname:ident, + // Configuration meta for when to call intrinsics and let LLVM figure it out + $( use_intrinsic: $use_intrinsic:meta, )? + args: $($arg:ident),+ , + ) => { + // FIXME: these use paths that are a pretty fragile (`super`). We should figure out + // something better w.r.t. how this is vendored into compiler-builtins. + + // Never use intrinsics if we are forcing soft floats, and only enable with the + // `unstable-intrinsics` feature. + #[cfg(intrinsics_enabled)] + select_implementation! { + @cfg $( $use_intrinsic )?; + if true { + return super::arch::intrinsics::$fname( $($arg),+ ); + } + } + }; + + // Coalesce helper to construct an expression only if a config is provided + (@cfg ; $ex:expr) => { }; + (@cfg $provided:meta; $ex:expr) => { #[cfg($provided)] $ex }; +} diff --git a/library/compiler-builtins/libm/src/math/support/mod.rs b/library/compiler-builtins/libm/src/math/support/mod.rs new file mode 100644 index 000000000000..10532f0d115a --- /dev/null +++ b/library/compiler-builtins/libm/src/math/support/mod.rs @@ -0,0 +1,2 @@ +#[macro_use] +pub mod macros; From 60e7e3b338c3aea7b615073d712c9c568e2a0e9a Mon Sep 17 00:00:00 2001 From: Trevor Gross Date: Mon, 28 Oct 2024 19:38:19 -0500 Subject: [PATCH 3/3] Make use of `select_implementation` Replace all uses of `llvm_intrinsically_optimized` with `select_implementation`. 
--- library/compiler-builtins/libm/src/math/ceil.rs | 12 +++++------- library/compiler-builtins/libm/src/math/ceilf.rs | 12 +++++------- library/compiler-builtins/libm/src/math/fabs.rs | 12 +++++------- library/compiler-builtins/libm/src/math/fabsf.rs | 12 +++++------- library/compiler-builtins/libm/src/math/floor.rs | 12 +++++------- .../compiler-builtins/libm/src/math/floorf.rs | 12 +++++------- library/compiler-builtins/libm/src/math/mod.rs | 12 ------------ library/compiler-builtins/libm/src/math/sqrt.rs | 16 +++++----------- library/compiler-builtins/libm/src/math/sqrtf.rs | 16 +++++----------- library/compiler-builtins/libm/src/math/trunc.rs | 12 +++++------- .../compiler-builtins/libm/src/math/truncf.rs | 12 +++++------- 11 files changed, 50 insertions(+), 90 deletions(-) diff --git a/library/compiler-builtins/libm/src/math/ceil.rs b/library/compiler-builtins/libm/src/math/ceil.rs index 1593fdaffcee..0da01b4d0b69 100644 --- a/library/compiler-builtins/libm/src/math/ceil.rs +++ b/library/compiler-builtins/libm/src/math/ceil.rs @@ -8,14 +8,12 @@ const TOINT: f64 = 1. / f64::EPSILON; /// Finds the nearest integer greater than or equal to `x`. #[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)] pub fn ceil(x: f64) -> f64 { - // On wasm32 we know that LLVM's intrinsic will compile to an optimized - // `f64.ceil` native instruction, so we can leverage this for both code size - // and speed. - llvm_intrinsically_optimized! { - #[cfg(target_arch = "wasm32")] { - return unsafe { ::core::intrinsics::ceilf64(x) } - } + select_implementation! 
{ + name: ceil, + use_intrinsic: target_arch = "wasm32", + args: x, } + #[cfg(all(target_arch = "x86", not(target_feature = "sse2")))] { //use an alternative implementation on x86, because the diff --git a/library/compiler-builtins/libm/src/math/ceilf.rs b/library/compiler-builtins/libm/src/math/ceilf.rs index bf9ba12279cf..0da384350aee 100644 --- a/library/compiler-builtins/libm/src/math/ceilf.rs +++ b/library/compiler-builtins/libm/src/math/ceilf.rs @@ -5,14 +5,12 @@ use core::f32; /// Finds the nearest integer greater than or equal to `x`. #[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)] pub fn ceilf(x: f32) -> f32 { - // On wasm32 we know that LLVM's intrinsic will compile to an optimized - // `f32.ceil` native instruction, so we can leverage this for both code size - // and speed. - llvm_intrinsically_optimized! { - #[cfg(target_arch = "wasm32")] { - return unsafe { ::core::intrinsics::ceilf32(x) } - } + select_implementation! { + name: ceilf, + use_intrinsic: target_arch = "wasm32", + args: x, } + let mut ui = x.to_bits(); let e = (((ui >> 23) & 0xff).wrapping_sub(0x7f)) as i32; diff --git a/library/compiler-builtins/libm/src/math/fabs.rs b/library/compiler-builtins/libm/src/math/fabs.rs index 3b0628aa63a7..8d3ea2fd6479 100644 --- a/library/compiler-builtins/libm/src/math/fabs.rs +++ b/library/compiler-builtins/libm/src/math/fabs.rs @@ -5,14 +5,12 @@ use core::u64; /// by direct manipulation of the bit representation of `x`. #[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)] pub fn fabs(x: f64) -> f64 { - // On wasm32 we know that LLVM's intrinsic will compile to an optimized - // `f64.abs` native instruction, so we can leverage this for both code size - // and speed. - llvm_intrinsically_optimized! { - #[cfg(target_arch = "wasm32")] { - return unsafe { ::core::intrinsics::fabsf64(x) } - } + select_implementation! 
{ + name: fabs, + use_intrinsic: target_arch = "wasm32", + args: x, } + f64::from_bits(x.to_bits() & (u64::MAX / 2)) } diff --git a/library/compiler-builtins/libm/src/math/fabsf.rs b/library/compiler-builtins/libm/src/math/fabsf.rs index f81c8ca44236..1dac6389d8f4 100644 --- a/library/compiler-builtins/libm/src/math/fabsf.rs +++ b/library/compiler-builtins/libm/src/math/fabsf.rs @@ -3,14 +3,12 @@ /// by direct manipulation of the bit representation of `x`. #[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)] pub fn fabsf(x: f32) -> f32 { - // On wasm32 we know that LLVM's intrinsic will compile to an optimized - // `f32.abs` native instruction, so we can leverage this for both code size - // and speed. - llvm_intrinsically_optimized! { - #[cfg(target_arch = "wasm32")] { - return unsafe { ::core::intrinsics::fabsf32(x) } - } + select_implementation! { + name: fabsf, + use_intrinsic: target_arch = "wasm32", + args: x, } + f32::from_bits(x.to_bits() & 0x7fffffff) } diff --git a/library/compiler-builtins/libm/src/math/floor.rs b/library/compiler-builtins/libm/src/math/floor.rs index e8fb21e5884b..2b9955ebae34 100644 --- a/library/compiler-builtins/libm/src/math/floor.rs +++ b/library/compiler-builtins/libm/src/math/floor.rs @@ -8,14 +8,12 @@ const TOINT: f64 = 1. / f64::EPSILON; /// Finds the nearest integer less than or equal to `x`. #[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)] pub fn floor(x: f64) -> f64 { - // On wasm32 we know that LLVM's intrinsic will compile to an optimized - // `f64.floor` native instruction, so we can leverage this for both code size - // and speed. - llvm_intrinsically_optimized! { - #[cfg(target_arch = "wasm32")] { - return unsafe { ::core::intrinsics::floorf64(x) } - } + select_implementation! 
{ + name: floor, + use_intrinsic: target_arch = "wasm32", + args: x, } + #[cfg(all(target_arch = "x86", not(target_feature = "sse2")))] { //use an alternative implementation on x86, because the diff --git a/library/compiler-builtins/libm/src/math/floorf.rs b/library/compiler-builtins/libm/src/math/floorf.rs index f66cab74fdcf..4f38cb15b7d8 100644 --- a/library/compiler-builtins/libm/src/math/floorf.rs +++ b/library/compiler-builtins/libm/src/math/floorf.rs @@ -5,14 +5,12 @@ use core::f32; /// Finds the nearest integer less than or equal to `x`. #[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)] pub fn floorf(x: f32) -> f32 { - // On wasm32 we know that LLVM's intrinsic will compile to an optimized - // `f32.floor` native instruction, so we can leverage this for both code size - // and speed. - llvm_intrinsically_optimized! { - #[cfg(target_arch = "wasm32")] { - return unsafe { ::core::intrinsics::floorf32(x) } - } + select_implementation! { + name: floorf, + use_intrinsic: target_arch = "wasm32", + args: x, } + let mut ui = x.to_bits(); let e = (((ui >> 23) as i32) & 0xff) - 0x7f; diff --git a/library/compiler-builtins/libm/src/math/mod.rs b/library/compiler-builtins/libm/src/math/mod.rs index a7e16bfc865c..393bc5150938 100644 --- a/library/compiler-builtins/libm/src/math/mod.rs +++ b/library/compiler-builtins/libm/src/math/mod.rs @@ -74,18 +74,6 @@ macro_rules! div { }; } -// FIXME: phase this out, to be replaced by the more flexible `select_implementation` -macro_rules! 
llvm_intrinsically_optimized { - (#[cfg($($clause:tt)*)] $e:expr) => { - #[cfg(all(intrinsics_enabled, not(feature = "force-soft-floats"), $($clause)*))] - { - if true { // thwart the dead code lint - $e - } - } - }; -} - // Private modules #[macro_use] mod support; diff --git a/library/compiler-builtins/libm/src/math/sqrt.rs b/library/compiler-builtins/libm/src/math/sqrt.rs index e2907384dcdb..2e856100f7dd 100644 --- a/library/compiler-builtins/libm/src/math/sqrt.rs +++ b/library/compiler-builtins/libm/src/math/sqrt.rs @@ -81,18 +81,12 @@ use core::f64; /// The square root of `x` (f64). #[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)] pub fn sqrt(x: f64) -> f64 { - // On wasm32 we know that LLVM's intrinsic will compile to an optimized - // `f64.sqrt` native instruction, so we can leverage this for both code size - // and speed. - llvm_intrinsically_optimized! { - #[cfg(target_arch = "wasm32")] { - return if x < 0.0 { - f64::NAN - } else { - unsafe { ::core::intrinsics::sqrtf64(x) } - } - } + select_implementation! { + name: sqrt, + use_intrinsic: target_arch = "wasm32", + args: x, } + #[cfg(all(target_feature = "sse2", not(feature = "force-soft-floats")))] { // Note: This path is unlikely since LLVM will usually have already diff --git a/library/compiler-builtins/libm/src/math/sqrtf.rs b/library/compiler-builtins/libm/src/math/sqrtf.rs index a738fc0b663a..b2996b350cb4 100644 --- a/library/compiler-builtins/libm/src/math/sqrtf.rs +++ b/library/compiler-builtins/libm/src/math/sqrtf.rs @@ -16,18 +16,12 @@ /// The square root of `x` (f32). #[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)] pub fn sqrtf(x: f32) -> f32 { - // On wasm32 we know that LLVM's intrinsic will compile to an optimized - // `f32.sqrt` native instruction, so we can leverage this for both code size - // and speed. - llvm_intrinsically_optimized! 
{ - #[cfg(target_arch = "wasm32")] { - return if x < 0.0 { - ::core::f32::NAN - } else { - unsafe { ::core::intrinsics::sqrtf32(x) } - } - } + select_implementation! { + name: sqrtf, + use_intrinsic: target_arch = "wasm32", + args: x, } + #[cfg(all(target_feature = "sse", not(feature = "force-soft-floats")))] { // Note: This path is unlikely since LLVM will usually have already diff --git a/library/compiler-builtins/libm/src/math/trunc.rs b/library/compiler-builtins/libm/src/math/trunc.rs index f7892a2c5536..6961bb950600 100644 --- a/library/compiler-builtins/libm/src/math/trunc.rs +++ b/library/compiler-builtins/libm/src/math/trunc.rs @@ -2,14 +2,12 @@ use core::f64; #[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)] pub fn trunc(x: f64) -> f64 { - // On wasm32 we know that LLVM's intrinsic will compile to an optimized - // `f64.trunc` native instruction, so we can leverage this for both code size - // and speed. - llvm_intrinsically_optimized! { - #[cfg(target_arch = "wasm32")] { - return unsafe { ::core::intrinsics::truncf64(x) } - } + select_implementation! { + name: trunc, + use_intrinsic: target_arch = "wasm32", + args: x, } + let x1p120 = f64::from_bits(0x4770000000000000); // 0x1p120f === 2 ^ 120 let mut i: u64 = x.to_bits(); diff --git a/library/compiler-builtins/libm/src/math/truncf.rs b/library/compiler-builtins/libm/src/math/truncf.rs index 20d5b73bd675..8270c8eb392d 100644 --- a/library/compiler-builtins/libm/src/math/truncf.rs +++ b/library/compiler-builtins/libm/src/math/truncf.rs @@ -2,14 +2,12 @@ use core::f32; #[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)] pub fn truncf(x: f32) -> f32 { - // On wasm32 we know that LLVM's intrinsic will compile to an optimized - // `f32.trunc` native instruction, so we can leverage this for both code size - // and speed. - llvm_intrinsically_optimized! { - #[cfg(target_arch = "wasm32")] { - return unsafe { ::core::intrinsics::truncf32(x) } - } + select_implementation! 
{ + name: truncf, + use_intrinsic: target_arch = "wasm32", + args: x, } + let x1p120 = f32::from_bits(0x7b800000); // 0x1p120f === 2 ^ 120 let mut i: u32 = x.to_bits();