From 212d463e89b0b2db7560fde4216b0c48a5448f34 Mon Sep 17 00:00:00 2001
From: Trevor Gross <tmgross@umich.edu>
Date: Wed, 12 Feb 2025 10:12:24 +0000
Subject: [PATCH 1/4] fma refactor 1/3: remove math/fma.rs

Done in stages so git tracks the moved file correctly.
---
 .../compiler-builtins/libm/src/math/fma.rs    | 40 -------------------
 1 file changed, 40 deletions(-)
 delete mode 100644 library/compiler-builtins/libm/src/math/fma.rs

diff --git a/library/compiler-builtins/libm/src/math/fma.rs b/library/compiler-builtins/libm/src/math/fma.rs
deleted file mode 100644
index 69cc3eb6726d..000000000000
--- a/library/compiler-builtins/libm/src/math/fma.rs
+++ /dev/null
@@ -1,40 +0,0 @@
-/// Fused multiply add (f64)
-///
-/// Computes `(x*y)+z`, rounded as one ternary operation (i.e. calculated with infinite precision).
-#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
-pub fn fma(x: f64, y: f64, z: f64) -> f64 {
-    return super::generic::fma(x, y, z);
-}
-
-#[cfg(test)]
-mod tests {
-    use super::*;
-    #[test]
-    fn fma_segfault() {
-        // These two inputs cause fma to segfault on release due to overflow:
-        assert_eq!(
-            fma(
-                -0.0000000000000002220446049250313,
-                -0.0000000000000002220446049250313,
-                -0.0000000000000002220446049250313
-            ),
-            -0.00000000000000022204460492503126,
-        );
-
-        let result = fma(-0.992, -0.992, -0.992);
-        //force rounding to storage format on x87 to prevent superious errors.
-        #[cfg(all(target_arch = "x86", not(target_feature = "sse2")))]
-        let result = force_eval!(result);
-        assert_eq!(result, -0.007936000000000007,);
-    }
-
-    #[test]
-    fn fma_sbb() {
-        assert_eq!(fma(-(1.0 - f64::EPSILON), f64::MIN, f64::MIN), -3991680619069439e277);
-    }
-
-    #[test]
-    fn fma_underflow() {
-        assert_eq!(fma(1.1102230246251565e-16, -9.812526705433188e-305, 1.0894e-320), 0.0,);
-    }
-}

From bcbdb0b74f0acb339fb49902fe0ebecc4f2c3509 Mon Sep 17 00:00:00 2001
From: Trevor Gross <tmgross@umich.edu>
Date: Wed, 12 Feb 2025 10:12:24 +0000
Subject: [PATCH 2/4] fma refactor 2/3: move math/generic/fma.rs to math/fma.rs

Done in stages so git tracks the moved file correctly.
---
 library/compiler-builtins/libm/src/math/{generic => }/fma.rs | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 rename library/compiler-builtins/libm/src/math/{generic => }/fma.rs (100%)

diff --git a/library/compiler-builtins/libm/src/math/generic/fma.rs b/library/compiler-builtins/libm/src/math/fma.rs
similarity index 100%
rename from library/compiler-builtins/libm/src/math/generic/fma.rs
rename to library/compiler-builtins/libm/src/math/fma.rs

From 720ba18931628f28fb690a2936e53a32233f88d5 Mon Sep 17 00:00:00 2001
From: Trevor Gross <tmgross@umich.edu>
Date: Wed, 12 Feb 2025 09:55:04 +0000
Subject: [PATCH 3/4] fma refactor 3/3: combine `fma` public API with its
 implementation

Similar to other recent changes, just put public API in the same file as
its generic implementation. To keep things slightly cleaner, split the
default implementation from the `_wide` implementation.

Also introduces a stub `fmaf16`.
---
 .../libm/etc/function-definitions.json        |   9 +-
 .../compiler-builtins/libm/src/math/fma.rs    | 140 +++++++-----------
 .../libm/src/math/fma_wide.rs                 |  97 ++++++++++++
 .../compiler-builtins/libm/src/math/fmaf.rs   |  21 ---
 .../libm/src/math/fmaf128.rs                  |   7 -
 .../libm/src/math/generic/mod.rs              |   2 -
 .../compiler-builtins/libm/src/math/mod.rs    |  10 +-
 7 files changed, 161 insertions(+), 125 deletions(-)
 create mode 100644 library/compiler-builtins/libm/src/math/fma_wide.rs
 delete mode 100644 library/compiler-builtins/libm/src/math/fmaf.rs
 delete mode 100644 library/compiler-builtins/libm/src/math/fmaf128.rs

diff --git a/library/compiler-builtins/libm/etc/function-definitions.json b/library/compiler-builtins/libm/etc/function-definitions.json
index 63d9927ad6f8..a966852b1128 100644
--- a/library/compiler-builtins/libm/etc/function-definitions.json
+++ b/library/compiler-builtins/libm/etc/function-definitions.json
@@ -343,22 +343,19 @@
     },
     "fma": {
         "sources": [
-            "src/math/fma.rs",
-            "src/math/generic/fma.rs"
+            "src/math/fma.rs"
         ],
         "type": "f64"
     },
     "fmaf": {
         "sources": [
-            "src/math/fmaf.rs",
-            "src/math/generic/fma.rs"
+            "src/math/fma_wide.rs"
         ],
         "type": "f32"
     },
     "fmaf128": {
         "sources": [
-            "src/math/fmaf128.rs",
-            "src/math/generic/fma.rs"
+            "src/math/fma.rs"
         ],
         "type": "f128"
     },
diff --git a/library/compiler-builtins/libm/src/math/fma.rs b/library/compiler-builtins/libm/src/math/fma.rs
index cb1061cc38b1..a54984c936b7 100644
--- a/library/compiler-builtins/libm/src/math/fma.rs
+++ b/library/compiler-builtins/libm/src/math/fma.rs
@@ -1,23 +1,28 @@
 /* SPDX-License-Identifier: MIT */
-/* origin: musl src/math/{fma,fmaf}.c. Ported to generic Rust algorithm in 2025, TG. */
+/* origin: musl src/math/fma.c. Ported to generic Rust algorithm in 2025, TG. */
 
 use super::super::support::{DInt, FpResult, HInt, IntTy, Round, Status};
-use super::super::{CastFrom, CastInto, DFloat, Float, HFloat, Int, MinInt};
+use super::{CastFrom, CastInto, Float, Int, MinInt};
 
-/// Fused multiply-add that works when there is not a larger float size available. Currently this
-/// is still specialized only for `f64`. Computes `(x * y) + z`.
+/// Fused multiply add (f64)
+///
+/// Computes `(x*y)+z`, rounded as one ternary operation (i.e. calculated with infinite precision).
 #[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
-pub fn fma<F>(x: F, y: F, z: F) -> F
-where
-    F: Float,
-    F: CastFrom<F::SignedInt>,
-    F: CastFrom<i8>,
-    F::Int: HInt,
-    u32: CastInto<F::Int>,
-{
+pub fn fma(x: f64, y: f64, z: f64) -> f64 {
     fma_round(x, y, z, Round::Nearest).val
 }
 
+/// Fused multiply add (f128)
+///
+/// Computes `(x*y)+z`, rounded as one ternary operation (i.e. calculated with infinite precision).
+#[cfg(f128_enabled)]
+#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+pub fn fmaf128(x: f128, y: f128, z: f128) -> f128 {
+    fma_round(x, y, z, Round::Nearest).val
+}
+
+/// Fused multiply-add that works when there is not a larger float size available. Computes
+/// `(x * y) + z`.
 pub fn fma_round<F>(x: F, y: F, z: F, _round: Round) -> FpResult<F>
 where
     F: Float,
@@ -222,79 +227,7 @@ where
     }
 
     // Use our exponent to scale the final value.
-    FpResult::new(super::scalbn(r, e), status)
-}
-
-/// Fma implementation when a hardware-backed larger float type is available. For `f32` and `f64`,
-/// `f64` has enough precision to represent the `f32` in its entirety, except for double rounding.
-pub fn fma_wide<F, B>(x: F, y: F, z: F) -> F
-where
-    F: Float + HFloat<D = B>,
-    B: Float + DFloat<H = F>,
-    B::Int: CastInto<i32>,
-    i32: CastFrom<i32>,
-{
-    fma_wide_round(x, y, z, Round::Nearest).val
-}
-
-pub fn fma_wide_round<F, B>(x: F, y: F, z: F, round: Round) -> FpResult<F>
-where
-    F: Float + HFloat<D = B>,
-    B: Float + DFloat<H = F>,
-    B::Int: CastInto<i32>,
-    i32: CastFrom<i32>,
-{
-    let one = IntTy::<B>::ONE;
-
-    let xy: B = x.widen() * y.widen();
-    let mut result: B = xy + z.widen();
-    let mut ui: B::Int = result.to_bits();
-    let re = result.ex();
-    let zb: B = z.widen();
-
-    let prec_diff = B::SIG_BITS - F::SIG_BITS;
-    let excess_prec = ui & ((one << prec_diff) - one);
-    let halfway = one << (prec_diff - 1);
-
-    // Common case: the larger precision is fine if...
-    // This is not a halfway case
-    if excess_prec != halfway
-        // Or the result is NaN
-        || re == B::EXP_SAT
-        // Or the result is exact
-        || (result - xy == zb && result - zb == xy)
-        // Or the mode is something other than round to nearest
-        || round != Round::Nearest
-    {
-        let min_inexact_exp = (B::EXP_BIAS as i32 + F::EXP_MIN_SUBNORM) as u32;
-        let max_inexact_exp = (B::EXP_BIAS as i32 + F::EXP_MIN) as u32;
-
-        let mut status = Status::OK;
-
-        if (min_inexact_exp..max_inexact_exp).contains(&re) && status.inexact() {
-            // This branch is never hit; requires previous operations to set a status
-            status.set_inexact(false);
-
-            result = xy + z.widen();
-            if status.inexact() {
-                status.set_underflow(true);
-            } else {
-                status.set_inexact(true);
-            }
-        }
-
-        return FpResult { val: result.narrow(), status };
-    }
-
-    let neg = ui >> (B::BITS - 1) != IntTy::<B>::ZERO;
-    let err = if neg == (zb > xy) { xy - result + zb } else { zb - result + xy };
-    if neg == (err < B::ZERO) {
-        ui += one;
-    } else {
-        ui -= one;
-    }
-
-    FpResult::ok(B::from_bits(ui).narrow())
+    FpResult::new(super::generic::scalbn(r, e), status)
 }
 
 /// Representation of `F` that has handled subnormals.
@@ -363,6 +296,7 @@ impl<F: Float> Norm<F> {
 mod tests {
     use super::*;
 
+    /// Test the generic `fma_round` algorithm for a given float.
     fn spec_test<F>()
     where
         F: Float,
@@ -375,6 +309,8 @@ mod tests {
         let y = F::from_bits(F::Int::ONE);
         let z = F::ZERO;
 
+        let fma = |x, y, z| fma_round(x, y, z, Round::Nearest).val;
+
         // 754-2020 says "When the exact result of (a × b) + c is non-zero yet the result of
         // fusedMultiplyAdd is zero because of rounding, the zero result takes the sign of the
         // exact result"
@@ -384,6 +320,11 @@ mod tests {
         assert_biteq!(fma(-x, -y, z), F::ZERO);
     }
 
+    #[test]
+    fn spec_test_f32() {
+        spec_test::<f32>();
+    }
+
     #[test]
     fn spec_test_f64() {
         spec_test::<f64>();
@@ -417,4 +358,33 @@ mod tests {
     fn spec_test_f128() {
         spec_test::<f128>();
     }
+
+    #[test]
+    fn fma_segfault() {
+        // These two inputs cause fma to segfault on release due to overflow:
+        assert_eq!(
+            fma(
+                -0.0000000000000002220446049250313,
+                -0.0000000000000002220446049250313,
+                -0.0000000000000002220446049250313
+            ),
+            -0.00000000000000022204460492503126,
+        );
+
+        let result = fma(-0.992, -0.992, -0.992);
+        //force rounding to storage format on x87 to prevent superious errors.
+        #[cfg(all(target_arch = "x86", not(target_feature = "sse2")))]
+        let result = force_eval!(result);
+        assert_eq!(result, -0.007936000000000007,);
+    }
+
+    #[test]
+    fn fma_sbb() {
+        assert_eq!(fma(-(1.0 - f64::EPSILON), f64::MIN, f64::MIN), -3991680619069439e277);
+    }
+
+    #[test]
+    fn fma_underflow() {
+        assert_eq!(fma(1.1102230246251565e-16, -9.812526705433188e-305, 1.0894e-320), 0.0,);
+    }
 }
diff --git a/library/compiler-builtins/libm/src/math/fma_wide.rs b/library/compiler-builtins/libm/src/math/fma_wide.rs
new file mode 100644
index 000000000000..a8c1a548879d
--- /dev/null
+++ b/library/compiler-builtins/libm/src/math/fma_wide.rs
@@ -0,0 +1,97 @@
+/* SPDX-License-Identifier: MIT */
+/* origin: musl src/math/fmaf.c. Ported to generic Rust algorithm in 2025, TG. */
+
+use super::super::support::{FpResult, IntTy, Round, Status};
+use super::{CastFrom, CastInto, DFloat, Float, HFloat, MinInt};
+
+// Placeholder so we can have `fmaf16` in the `Float` trait.
+#[allow(unused)]
+#[cfg(f16_enabled)]
+#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+pub(crate) fn fmaf16(_x: f16, _y: f16, _z: f16) -> f16 {
+    unimplemented!()
+}
+
+/// Floating multiply add (f32)
+///
+/// Computes `(x*y)+z`, rounded as one ternary operation (i.e. calculated with infinite precision).
+#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+pub fn fmaf(x: f32, y: f32, z: f32) -> f32 {
+    fma_wide_round(x, y, z, Round::Nearest).val
+}
+
+/// Fma implementation when a hardware-backed larger float type is available. For `f32` and `f64`,
+/// `f64` has enough precision to represent the `f32` in its entirety, except for double rounding.
+pub fn fma_wide_round<F, B>(x: F, y: F, z: F, round: Round) -> FpResult<F>
+where
+    F: Float + HFloat<D = B>,
+    B: Float + DFloat<H = F>,
+    B::Int: CastInto<i32>,
+    i32: CastFrom<i32>,
+{
+    let one = IntTy::<B>::ONE;
+
+    let xy: B = x.widen() * y.widen();
+    let mut result: B = xy + z.widen();
+    let mut ui: B::Int = result.to_bits();
+    let re = result.ex();
+    let zb: B = z.widen();
+
+    let prec_diff = B::SIG_BITS - F::SIG_BITS;
+    let excess_prec = ui & ((one << prec_diff) - one);
+    let halfway = one << (prec_diff - 1);
+
+    // Common case: the larger precision is fine if...
+    // This is not a halfway case
+    if excess_prec != halfway
+        // Or the result is NaN
+        || re == B::EXP_SAT
+        // Or the result is exact
+        || (result - xy == zb && result - zb == xy)
+        // Or the mode is something other than round to nearest
+        || round != Round::Nearest
+    {
+        let min_inexact_exp = (B::EXP_BIAS as i32 + F::EXP_MIN_SUBNORM) as u32;
+        let max_inexact_exp = (B::EXP_BIAS as i32 + F::EXP_MIN) as u32;
+
+        let mut status = Status::OK;
+
+        if (min_inexact_exp..max_inexact_exp).contains(&re) && status.inexact() {
+            // This branch is never hit; requires previous operations to set a status
+            status.set_inexact(false);
+
+            result = xy + z.widen();
+            if status.inexact() {
+                status.set_underflow(true);
+            } else {
+                status.set_inexact(true);
+            }
+        }
+
+        return FpResult { val: result.narrow(), status };
+    }
+
+    let neg = ui >> (B::BITS - 1) != IntTy::<B>::ZERO;
+    let err = if neg == (zb > xy) { xy - result + zb } else { zb - result + xy };
+    if neg == (err < B::ZERO) {
+        ui += one;
+    } else {
+        ui -= one;
+    }
+
+    FpResult::ok(B::from_bits(ui).narrow())
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn issue_263() {
+        let a = f32::from_bits(1266679807);
+        let b = f32::from_bits(1300234242);
+        let c = f32::from_bits(1115553792);
+        let expected = f32::from_bits(1501560833);
+        assert_eq!(fmaf(a, b, c), expected);
+    }
+}
diff --git a/library/compiler-builtins/libm/src/math/fmaf.rs b/library/compiler-builtins/libm/src/math/fmaf.rs
deleted file mode 100644
index 40d7f40d6173..000000000000
--- a/library/compiler-builtins/libm/src/math/fmaf.rs
+++ /dev/null
@@ -1,21 +0,0 @@
-/// Floating multiply add (f32)
-///
-/// Computes `(x*y)+z`, rounded as one ternary operation:
-/// Computes the value (as if) to infinite precision and rounds once to the result format,
-/// according to the rounding mode characterized by the value of FLT_ROUNDS.
-#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
-pub fn fmaf(x: f32, y: f32, z: f32) -> f32 {
-    super::generic::fma_wide(x, y, z)
-}
-
-#[cfg(test)]
-mod tests {
-    #[test]
-    fn issue_263() {
-        let a = f32::from_bits(1266679807);
-        let b = f32::from_bits(1300234242);
-        let c = f32::from_bits(1115553792);
-        let expected = f32::from_bits(1501560833);
-        assert_eq!(super::fmaf(a, b, c), expected);
-    }
-}
diff --git a/library/compiler-builtins/libm/src/math/fmaf128.rs b/library/compiler-builtins/libm/src/math/fmaf128.rs
deleted file mode 100644
index 50f7360deb45..000000000000
--- a/library/compiler-builtins/libm/src/math/fmaf128.rs
+++ /dev/null
@@ -1,7 +0,0 @@
-/// Fused multiply add (f128)
-///
-/// Computes `(x*y)+z`, rounded as one ternary operation (i.e. calculated with infinite precision).
-#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
-pub fn fmaf128(x: f128, y: f128, z: f128) -> f128 {
-    return super::generic::fma(x, y, z);
-}
diff --git a/library/compiler-builtins/libm/src/math/generic/mod.rs b/library/compiler-builtins/libm/src/math/generic/mod.rs
index f224eba731c3..9be185f809f1 100644
--- a/library/compiler-builtins/libm/src/math/generic/mod.rs
+++ b/library/compiler-builtins/libm/src/math/generic/mod.rs
@@ -3,7 +3,6 @@ mod copysign;
 mod fabs;
 mod fdim;
 mod floor;
-mod fma;
 mod fmax;
 mod fmaximum;
 mod fmaximum_num;
@@ -22,7 +21,6 @@ pub use copysign::copysign;
 pub use fabs::fabs;
 pub use fdim::fdim;
 pub use floor::floor;
-pub use fma::{fma, fma_wide};
 pub use fmax::fmax;
 pub use fmaximum::fmaximum;
 pub use fmaximum_num::fmaximum_num;
diff --git a/library/compiler-builtins/libm/src/math/mod.rs b/library/compiler-builtins/libm/src/math/mod.rs
index e58d79adc419..5fc8fa0b3cd0 100644
--- a/library/compiler-builtins/libm/src/math/mod.rs
+++ b/library/compiler-builtins/libm/src/math/mod.rs
@@ -164,7 +164,7 @@ mod fdimf;
 mod floor;
 mod floorf;
 mod fma;
-mod fmaf;
+mod fma_wide;
 mod fmin_fmax;
 mod fminimum_fmaximum;
 mod fminimum_fmaximum_num;
@@ -271,7 +271,7 @@ pub use self::fdimf::fdimf;
 pub use self::floor::floor;
 pub use self::floorf::floorf;
 pub use self::fma::fma;
-pub use self::fmaf::fmaf;
+pub use self::fma_wide::fmaf;
 pub use self::fmin_fmax::{fmax, fmaxf, fmin, fminf};
 pub use self::fminimum_fmaximum::{fmaximum, fmaximumf, fminimum, fminimumf};
 pub use self::fminimum_fmaximum_num::{fmaximum_num, fmaximum_numf, fminimum_num, fminimum_numf};
@@ -370,6 +370,9 @@ cfg_if! {
         pub use self::sqrtf16::sqrtf16;
         pub use self::truncf16::truncf16;
         // verify-sorted-end
+
+        #[allow(unused_imports)]
+        pub(crate) use self::fma_wide::fmaf16;
     }
 }
 
@@ -381,7 +384,6 @@ cfg_if! {
         mod fabsf128;
         mod fdimf128;
         mod floorf128;
-        mod fmaf128;
         mod fmodf128;
         mod ldexpf128;
         mod roundf128;
@@ -396,7 +398,7 @@ cfg_if! {
         pub use self::fabsf128::fabsf128;
         pub use self::fdimf128::fdimf128;
         pub use self::floorf128::floorf128;
-        pub use self::fmaf128::fmaf128;
+        pub use self::fma::fmaf128;
         pub use self::fmin_fmax::{fmaxf128, fminf128};
         pub use self::fminimum_fmaximum::{fmaximumf128, fminimumf128};
         pub use self::fminimum_fmaximum_num::{fmaximum_numf128, fminimum_numf128};

From c01153d29b1917d0d1c805f98a751f3ecaf5952b Mon Sep 17 00:00:00 2001
From: Trevor Gross <tmgross@umich.edu>
Date: Wed, 12 Feb 2025 10:16:48 +0000
Subject: [PATCH 4/4] Make `fma` a trait method on `Float`

---
 .../libm/crates/libm-test/src/f8_impl.rs      |  4 +++
 .../libm/etc/function-definitions.json        |  3 +--
 .../libm/etc/update-api-list.py               |  2 +-
 .../compiler-builtins/libm/src/math/cbrt.rs   | 20 +++-----------
 .../libm/src/math/support/float_traits.rs     | 26 ++++++++++++++-----
 5 files changed, 30 insertions(+), 25 deletions(-)

diff --git a/library/compiler-builtins/libm/crates/libm-test/src/f8_impl.rs b/library/compiler-builtins/libm/crates/libm-test/src/f8_impl.rs
index 56ea0b72995c..0683d8392662 100644
--- a/library/compiler-builtins/libm/crates/libm-test/src/f8_impl.rs
+++ b/library/compiler-builtins/libm/crates/libm-test/src/f8_impl.rs
@@ -78,6 +78,10 @@ impl Float for f8 {
         libm::generic::copysign(self, other)
     }
 
+    fn fma(self, _y: Self, _z: Self) -> Self {
+        unimplemented!()
+    }
+
     fn normalize(_significand: Self::Int) -> (i32, Self::Int) {
         unimplemented!()
     }
diff --git a/library/compiler-builtins/libm/etc/function-definitions.json b/library/compiler-builtins/libm/etc/function-definitions.json
index a966852b1128..64a775ba9f17 100644
--- a/library/compiler-builtins/libm/etc/function-definitions.json
+++ b/library/compiler-builtins/libm/etc/function-definitions.json
@@ -130,8 +130,7 @@
     "copysign": {
         "sources": [
             "src/math/copysign.rs",
-            "src/math/generic/copysign.rs",
-            "src/math/support/float_traits.rs"
+            "src/math/generic/copysign.rs"
         ],
         "type": "f64"
     },
diff --git a/library/compiler-builtins/libm/etc/update-api-list.py b/library/compiler-builtins/libm/etc/update-api-list.py
index c0b6e41d3002..67d1b0508616 100755
--- a/library/compiler-builtins/libm/etc/update-api-list.py
+++ b/library/compiler-builtins/libm/etc/update-api-list.py
@@ -24,7 +24,7 @@ ROOT_DIR = ETC_DIR.parent
 DIRECTORIES = [".github", "ci", "crates", "etc", "src"]
 
 # These files do not trigger a retest.
-IGNORED_SOURCES = ["src/libm_helper.rs"]
+IGNORED_SOURCES = ["src/libm_helper.rs", "src/math/support/float_traits.rs"]
 
 IndexTy: TypeAlias = dict[str, dict[str, Any]]
 """Type of the `index` item in rustdoc's JSON output"""
diff --git a/library/compiler-builtins/libm/src/math/cbrt.rs b/library/compiler-builtins/libm/src/math/cbrt.rs
index 8560d37abf32..9d3311cd6a87 100644
--- a/library/compiler-builtins/libm/src/math/cbrt.rs
+++ b/library/compiler-builtins/libm/src/math/cbrt.rs
@@ -103,11 +103,11 @@ pub fn cbrt_round(x: f64, round: Round) -> FpResult<f64> {
      * and rr an approximation of 1/zz. We now perform another iteration of
      * Newton-Raphson, this time with a linear approximation only. */
     y2 = y * y;
-    let mut y2l: f64 = fmaf64(y, y, -y2);
+    let mut y2l: f64 = y.fma(y, -y2);
 
     /* y2 + y2l = y^2 exactly */
     let mut y3: f64 = y2 * y;
-    let mut y3l: f64 = fmaf64(y, y2, -y3) + y * y2l;
+    let mut y3l: f64 = y.fma(y2, -y3) + y * y2l;
 
     /* y3 + y3l approximates y^3 with about 106 bits of accuracy */
     h = ((y3 - zz) + y3l) * rr;
@@ -132,9 +132,9 @@ pub fn cbrt_round(x: f64, round: Round) -> FpResult<f64> {
         cold_path();
 
         y2 = y1 * y1;
-        y2l = fmaf64(y1, y1, -y2);
+        y2l = y1.fma(y1, -y2);
         y3 = y2 * y1;
-        y3l = fmaf64(y1, y2, -y3) + y1 * y2l;
+        y3l = y1.fma(y2, -y3) + y1 * y2l;
         h = ((y3 - zz) + y3l) * rr;
         dy = h * (y1 * u0);
         y = y1 - dy;
@@ -198,18 +198,6 @@ pub fn cbrt_round(x: f64, round: Round) -> FpResult<f64> {
     FpResult::ok(f64::from_bits(cvt3))
 }
 
-fn fmaf64(x: f64, y: f64, z: f64) -> f64 {
-    #[cfg(intrinsics_enabled)]
-    {
-        return unsafe { core::intrinsics::fmaf64(x, y, z) };
-    }
-
-    #[cfg(not(intrinsics_enabled))]
-    {
-        return super::fma(x, y, z);
-    }
-}
-
 #[cfg(test)]
 mod tests {
     use super::*;
diff --git a/library/compiler-builtins/libm/src/math/support/float_traits.rs b/library/compiler-builtins/libm/src/math/support/float_traits.rs
index 534ca9a07fa7..96c209c852b1 100644
--- a/library/compiler-builtins/libm/src/math/support/float_traits.rs
+++ b/library/compiler-builtins/libm/src/math/support/float_traits.rs
@@ -160,9 +160,11 @@ pub trait Float:
     fn abs(self) -> Self;
 
     /// Returns a number composed of the magnitude of self and the sign of sign.
-    #[allow(dead_code)]
     fn copysign(self, other: Self) -> Self;
 
+    /// Fused multiply add, rounding once.
+    fn fma(self, y: Self, z: Self) -> Self;
+
     /// Returns (normalized exponent, normalized significand)
     #[allow(dead_code)]
     fn normalize(significand: Self::Int) -> (i32, Self::Int);
@@ -184,7 +186,9 @@ macro_rules! float_impl {
         $sity:ident,
         $bits:expr,
         $significand_bits:expr,
-        $from_bits:path
+        $from_bits:path,
+        $fma_fn:ident,
+        $fma_intrinsic:ident
     ) => {
         impl Float for $ty {
             type Int = $ity;
@@ -252,6 +256,16 @@ macro_rules! float_impl {
                     }
                 }
             }
+            fn fma(self, y: Self, z: Self) -> Self {
+                cfg_if! {
+                    // fma is not yet available in `core`
+                    if #[cfg(intrinsics_enabled)] {
+                        unsafe{ core::intrinsics::$fma_intrinsic(self, y, z) }
+                    } else {
+                        super::super::$fma_fn(self, y, z)
+                    }
+                }
+            }
             fn normalize(significand: Self::Int) -> (i32, Self::Int) {
                 let shift = significand.leading_zeros().wrapping_sub(Self::EXP_BITS);
                 (1i32.wrapping_sub(shift as i32), significand << shift as Self::Int)
@@ -261,11 +275,11 @@ macro_rules! float_impl {
 }
 
 #[cfg(f16_enabled)]
-float_impl!(f16, u16, i16, 16, 10, f16::from_bits);
-float_impl!(f32, u32, i32, 32, 23, f32_from_bits);
-float_impl!(f64, u64, i64, 64, 52, f64_from_bits);
+float_impl!(f16, u16, i16, 16, 10, f16::from_bits, fmaf16, fmaf16);
+float_impl!(f32, u32, i32, 32, 23, f32_from_bits, fmaf, fmaf32);
+float_impl!(f64, u64, i64, 64, 52, f64_from_bits, fma, fmaf64);
 #[cfg(f128_enabled)]
-float_impl!(f128, u128, i128, 128, 112, f128::from_bits);
+float_impl!(f128, u128, i128, 128, 112, f128::from_bits, fmaf128, fmaf128);
 
 /* FIXME(msrv): vendor some things that are not const stable at our MSRV */