From 9c98c461473a26354c519702d79a689dda7f9645 Mon Sep 17 00:00:00 2001
From: Trevor Gross <tmgross@umich.edu>
Date: Wed, 15 Jan 2025 11:34:17 +0000
Subject: [PATCH 1/5] Don't set `opt_level` in the musl build script

`cc` automatically reads this from Cargo's `OPT_LEVEL` variable so we
don't need to set it explicitly. Remove this so running in a debugger
makes more sense.
---
 library/compiler-builtins/libm/crates/musl-math-sys/build.rs | 1 -
 1 file changed, 1 deletion(-)

diff --git a/library/compiler-builtins/libm/crates/musl-math-sys/build.rs b/library/compiler-builtins/libm/crates/musl-math-sys/build.rs
index 03deb4ff0ac0..d75748159cad 100644
--- a/library/compiler-builtins/libm/crates/musl-math-sys/build.rs
+++ b/library/compiler-builtins/libm/crates/musl-math-sys/build.rs
@@ -151,7 +151,6 @@ fn build_musl_math(cfg: &Config) {
         .flag_if_supported("-ffreestanding")
         .flag_if_supported("-nostdinc")
         .define("_ALL_SOURCE", "1")
-        .opt_level(3)
         .define(
             "ROOT_INCLUDE_FEATURES",
             Some(musl_dir.join("include/features.h").to_str().unwrap()),

From 8927014e919b789ef957d82ed13eaebe33922e29 Mon Sep 17 00:00:00 2001
From: Trevor Gross <tmgross@umich.edu>
Date: Wed, 22 Jan 2025 05:29:36 +0000
Subject: [PATCH 2/5] Enable `force-soft-floats` for extensive tests

Any architecture-specific float operations are likely to consist of only
a few instructions, but the softfloat implementations are much more
complex. Ensure this is what gets tested.
---
 library/compiler-builtins/libm/.github/workflows/main.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/library/compiler-builtins/libm/.github/workflows/main.yaml b/library/compiler-builtins/libm/.github/workflows/main.yaml
index 7693de6559a1..89c5facef44d 100644
--- a/library/compiler-builtins/libm/.github/workflows/main.yaml
+++ b/library/compiler-builtins/libm/.github/workflows/main.yaml
@@ -270,7 +270,7 @@ jobs:
           fi
 
           LIBM_EXTENSIVE_TESTS="$CHANGED" cargo t \
-            --features build-mpfr,unstable \
+            --features build-mpfr,unstable,force-soft-floats \
             --profile release-checked \
             -- extensive
       - name: Print test logs if available

From 573ded2ee8a59d3f3bb2a56477fc939b0922cf44 Mon Sep 17 00:00:00 2001
From: Trevor Gross <tmgross@umich.edu>
Date: Sun, 12 Jan 2025 11:45:40 +0000
Subject: [PATCH 3/5] Port the most recent version of Musl's `sqrt` as a
 generic algorithm

Musl commit 97e9b73d59 ("math: new software sqrt") adds a new algorithm
using Goldschmidt division. Port this algorithm to Rust and make it
generic, which shows a notable performance improvement over the existing
algorithm.

This also allows adding square root routines for `f16` and `f128`.
---
 .../libm/etc/function-definitions.json        |   2 +
 .../libm/src/math/generic/mod.rs              |   2 +
 .../libm/src/math/generic/sqrt.rs             | 419 ++++++++++++++++++
 .../compiler-builtins/libm/src/math/sqrt.rs   | 252 +----------
 .../compiler-builtins/libm/src/math/sqrtf.rs  | 133 +-----
 .../libm/src/math/support/int_traits.rs       |   2 +
 .../libm/src/math/support/macros.rs           |  22 +-
 .../libm/src/math/support/mod.rs              |  11 +
 8 files changed, 450 insertions(+), 393 deletions(-)
 create mode 100644 library/compiler-builtins/libm/src/math/generic/sqrt.rs

diff --git a/library/compiler-builtins/libm/etc/function-definitions.json b/library/compiler-builtins/libm/etc/function-definitions.json
index dbaac931c769..9f7c8ab25a03 100644
--- a/library/compiler-builtins/libm/etc/function-definitions.json
+++ b/library/compiler-builtins/libm/etc/function-definitions.json
@@ -704,6 +704,7 @@
             "src/libm_helper.rs",
             "src/math/arch/i686.rs",
             "src/math/arch/wasm32.rs",
+            "src/math/generic/sqrt.rs",
             "src/math/sqrt.rs"
         ],
         "type": "f64"
@@ -712,6 +713,7 @@
         "sources": [
             "src/math/arch/i686.rs",
             "src/math/arch/wasm32.rs",
+            "src/math/generic/sqrt.rs",
             "src/math/sqrtf.rs"
         ],
         "type": "f32"
diff --git a/library/compiler-builtins/libm/src/math/generic/mod.rs b/library/compiler-builtins/libm/src/math/generic/mod.rs
index 2b068d6c54d3..3b5a2c3effd7 100644
--- a/library/compiler-builtins/libm/src/math/generic/mod.rs
+++ b/library/compiler-builtins/libm/src/math/generic/mod.rs
@@ -1,9 +1,11 @@
 mod copysign;
 mod fabs;
 mod fdim;
+mod sqrt;
 mod trunc;
 
 pub use copysign::copysign;
 pub use fabs::fabs;
 pub use fdim::fdim;
+pub use sqrt::sqrt;
 pub use trunc::trunc;
diff --git a/library/compiler-builtins/libm/src/math/generic/sqrt.rs b/library/compiler-builtins/libm/src/math/generic/sqrt.rs
new file mode 100644
index 000000000000..a2e054f3cf2b
--- /dev/null
+++ b/library/compiler-builtins/libm/src/math/generic/sqrt.rs
@@ -0,0 +1,419 @@
+/* SPDX-License-Identifier: MIT */
+/* origin: musl src/math/sqrt.c. Ported to generic Rust algorithm in 2025, TG. */
+
+//! Generic square root algorithm.
+//!
+//! This routine operates around `m_u2`, a U.2 (fixed point with two integral bits) mantissa
+//! within the range [1, 4). A table lookup provides an initial estimate, then goldschmidt
+//! iterations at various widths are used to approach the real values.
+//!
+//! For the iterations, `r` is a U0 number that approaches `1/sqrt(m_u2)`, and `s` is a U2 number
+//! that approaches `sqrt(m_u2)`. Recall that m_u2 ∈ [1, 4).
+//!
+//! With Newton-Raphson iterations, this would be:
+//!
+//! - `w = r * r           w ~ 1 / m`
+//! - `u = 3 - m * w       u ~ 3 - m * w = 3 - m / m = 2`
+//! - `r = r * u / 2       r ~ r`
+//!
+//! (Note that the righthand column does not show anything analytically meaningful (i.e. r ~ r),
+//! since the value of performing one iteration is in reducing the error representable by `~`).
+//!
+//! Instead of Newton-Raphson iterations, Goldschmidt iterations are used to calculate
+//! `s = m * r`:
+//!
+//! - `s = m * r           s ~ m / sqrt(m)`
+//! - `u = 3 - s * r       u ~ 3 - (m / sqrt(m)) * (1 / sqrt(m)) = 3 - m / m = 2`
+//! - `r = r * u / 2       r ~ r`
+//! - `s = s * u / 2       s ~ s`
+//!
+//! The above is precise because it uses the original value `m`. There is also a faster version
+//! that performs fewer steps but does not use `m`:
+//!
+//! - `u = 3 - s * r       u ~ 3 - 1`
+//! - `r = r * u / 2       r ~ r`
+//! - `s = s * u / 2       s ~ s`
+//!
+//! Rounding errors accumulate faster with the second version, so it is only used for subsequent
+//! iterations within the same width integer. The first version is always used for the first
+//! iteration at a new width in order to avoid this accumulation.
+//!
+//! Goldschmidt has the advantage over Newton-Raphson that `sqrt(x)` and `1/sqrt(x)` are
+//! computed at the same time, i.e. there is no need to calculate `1/sqrt(x)` and invert it.
+
+use super::super::support::{IntTy, cold_path, raise_invalid};
+use super::super::{CastFrom, CastInto, DInt, Float, HInt, Int, MinInt};
+
+pub fn sqrt<F>(x: F) -> F
+where
+    F: Float + SqrtHelper,
+    F::Int: HInt,
+    F::Int: From<u8>,
+    F::Int: From<F::ISet2>,
+    F::Int: CastInto<F::ISet1>,
+    F::Int: CastInto<F::ISet2>,
+    u32: CastInto<F::Int>,
+{
+    let zero = IntTy::<F>::ZERO;
+    let one = IntTy::<F>::ONE;
+
+    let mut ix = x.to_bits();
+
+    // Top is the exponent and sign, which may or may not be shifted. If the float fits into a
+    // `u32`, we can get by without paying shifting costs.
+    let noshift = F::BITS <= u32::BITS;
+    let (mut top, special_case) = if noshift {
+        let exp_lsb = one << F::SIG_BITS;
+        let special_case = ix.wrapping_sub(exp_lsb) >= F::EXP_MASK - exp_lsb;
+        (Exp::NoShift(()), special_case)
+    } else {
+        let top = u32::cast_from(ix >> F::SIG_BITS);
+        let special_case = top.wrapping_sub(1) >= F::EXP_MAX - 1;
+        (Exp::Shifted(top), special_case)
+    };
+
+    // Handle NaN, zero, and out of domain (<= 0)
+    if special_case {
+        cold_path();
+
+        // +/-0
+        if ix << 1 == zero {
+            return x;
+        }
+
+        // Positive infinity
+        if ix == F::EXP_MASK {
+            return x;
+        }
+
+        // NaN or negative
+        if ix > F::EXP_MASK {
+            return raise_invalid(x);
+        }
+
+        // Normalize subnormals by multiplying by 1.0 << SIG_BITS (e.g. 0x1p52 for doubles).
+        let scaled = x * F::from_parts(false, (F::SIG_BITS + F::EXP_BIAS) as i32, zero);
+        ix = scaled.to_bits();
+        match top {
+            Exp::Shifted(ref mut v) => {
+                *v = scaled.exp().unsigned();
+                *v = (*v).wrapping_sub(F::SIG_BITS);
+            }
+            Exp::NoShift(()) => {
+                ix = ix.wrapping_sub((F::SIG_BITS << F::SIG_BITS).cast());
+            }
+        }
+    }
+
+    // Reduce arguments such that `x = 4^e * m`:
+    //
+    // - m_u2 ∈ [1, 4), a fixed point U2.BITS number
+    // - 2^e is the exponent part of the result
+    let (m_u2, exp) = match top {
+        Exp::Shifted(top) => {
+            // We now know `x` is positive, so `top` is just its (biased) exponent
+            let mut e = top;
+            // Construct a fixed point representation of the mantissa.
+            let mut m_u2 = (ix | F::IMPLICIT_BIT) << F::EXP_BITS;
+            let even = (e & 1) != 0;
+            if even {
+                m_u2 >>= 1;
+            }
+            e = (e.wrapping_add(F::EXP_MAX >> 1)) >> 1;
+            (m_u2, Exp::Shifted(e))
+        }
+        Exp::NoShift(()) => {
+            let even = ix & (one << F::SIG_BITS) != zero;
+
+            // Exponent part of the return value
+            let mut e_noshift = ix >> 1;
+            // ey &= (F::EXP_MASK << 2) >> 2; // clear the top exponent bit (result = 1.0)
+            e_noshift += (F::EXP_MASK ^ (F::SIGN_MASK >> 1)) >> 1;
+            e_noshift &= F::EXP_MASK;
+
+            let m1 = (ix << F::EXP_BITS) | F::SIGN_MASK;
+            let m0 = (ix << (F::EXP_BITS - 1)) & !F::SIGN_MASK;
+            let m_u2 = if even { m0 } else { m1 };
+
+            (m_u2, Exp::NoShift(e_noshift))
+        }
+    };
+
+    // Extract the top 6 bits of the significand with the lowest bit of the exponent.
+    let i = usize::cast_from(ix >> (F::SIG_BITS - 6)) & 0b1111111;
+
+    // Start with an initial guess for `r = 1 / sqrt(m)` from the table, and shift `m` as an
+    // initial value for `s = sqrt(m)`. See the module documentation for details.
+    let r1_u0: F::ISet1 = F::ISet1::cast_from(RSQRT_TAB[i]) << (F::ISet1::BITS - 16);
+    let s1_u2: F::ISet1 = ((m_u2) >> (F::BITS - F::ISet1::BITS)).cast();
+
+    // Perform iterations, if any, at quarter width (used for `f128`).
+    let (r1_u0, _s1_u2) = goldschmidt::<F, F::ISet1>(r1_u0, s1_u2, F::SET1_ROUNDS, false);
+
+    // Widen values and perform iterations at half width (used for `f64` and `f128`).
+    let r2_u0: F::ISet2 = F::ISet2::from(r1_u0) << (F::ISet2::BITS - F::ISet1::BITS);
+    let s2_u2: F::ISet2 = ((m_u2) >> (F::BITS - F::ISet2::BITS)).cast();
+    let (r2_u0, _s2_u2) = goldschmidt::<F, F::ISet2>(r2_u0, s2_u2, F::SET2_ROUNDS, false);
+
+    // Perform final iterations at full width (used for all float types).
+    let r_u0: F::Int = F::Int::from(r2_u0) << (F::BITS - F::ISet2::BITS);
+    let s_u2: F::Int = m_u2;
+    let (_r_u0, s_u2) = goldschmidt::<F, F::Int>(r_u0, s_u2, F::FINAL_ROUNDS, true);
+
+    // Shift back to mantissa position.
+    let mut m = s_u2 >> (F::EXP_BITS - 2);
+
+    // The musl source includes the following comment (with literals replaced):
+    //
+    // > s < sqrt(m) < s + 0x1.09p-SIG_BITS
+    // > compute nearest rounded result: the nearest result to SIG_BITS bits is either s or
+    // > s+0x1p-SIG_BITS, we can decide by comparing (2^SIG_BITS s + 0.5)^2 to 2^(2*SIG_BITS) m.
+    //
+    // Expanding this with , with `SIG_BITS = p` and adjusting based on the operations done to
+    // `d0` and `d1`:
+    //
+    // - `2^(2p)m ≟ ((2^p)m + 0.5)^2`
+    // - `2^(2p)m ≟ 2^(2p)m^2 + (2^p)m + 0.25`
+    // - `2^(2p)m - m^2 ≟ (2^(2p) - 1)m^2 + (2^p)m + 0.25`
+    // - `(1 - 2^(2p))m + m^2 ≟ (1 - 2^(2p))m^2 + (1 - 2^p)m + 0.25` (?)
+    //
+    // I do not follow how the rounding bit is extracted from this comparison with the below
+    // operations. In any case, the algorithm is well tested.
+
+    // The value needed to shift `m_u2` by to create `m*2^(2p)`. `2p = 2 * F::SIG_BITS`,
+    // `F::BITS - 2` accounts for the offset that `m_u2` already has.
+    let shift = 2 * F::SIG_BITS - (F::BITS - 2);
+
+    // `2^(2p)m - m^2`
+    let d0 = (m_u2 << shift).wrapping_sub(m.wrapping_mul(m));
+    // `m - 2^(2p)m + m^2`
+    let d1 = m.wrapping_sub(d0);
+    m += d1 >> (F::BITS - 1);
+    m &= F::SIG_MASK;
+
+    match exp {
+        Exp::Shifted(e) => m |= IntTy::<F>::cast_from(e) << F::SIG_BITS,
+        Exp::NoShift(e) => m |= e,
+    };
+
+    let mut y = F::from_bits(m);
+
+    // FIXME(f16): the fenv math does not work for `f16`
+    if F::BITS > 16 {
+        // Handle rounding and inexact. `(m + 1)^2 == 2^shift m` is exact; for all other cases, add
+        // a tiny value to cause fenv effects.
+        let d2 = d1.wrapping_add(m).wrapping_add(one);
+        let mut tiny = if d2 == zero {
+            cold_path();
+            zero
+        } else {
+            F::IMPLICIT_BIT
+        };
+
+        tiny |= (d1 ^ d2) & F::SIGN_MASK;
+        let t = F::from_bits(tiny);
+        y = y + t;
+    }
+
+    y
+}
+
+/// Multiply at the wider integer size, returning the high half.
+fn wmulh<I: HInt>(a: I, b: I) -> I {
+    a.widen_mul(b).hi()
+}
+
+/// Perform `count` goldschmidt iterations, returning `(r_u0, s_u?)`.
+///
+/// - `r_u0` is the reciprocal `r ~ 1 / sqrt(m)`, as U0.
+/// - `s_u2` is the square root, `s ~ sqrt(m)`, as U2.
+/// - `count` is the number of iterations to perform.
+/// - `final_set` should be true if this is the last round (same-sized integer). If so, the
+///   returned `s` will be U3, for later shifting. Otherwise, the returned `s` is U2.
+///
+/// Note that performance relies on the optimizer being able to unroll these loops (reasonably
+/// trivial, `count` is a constant when called).
+#[inline]
+fn goldschmidt<F, I>(mut r_u0: I, mut s_u2: I, count: u32, final_set: bool) -> (I, I)
+where
+    F: SqrtHelper,
+    I: HInt + From<u8>,
+{
+    let three_u2 = I::from(0b11u8) << (I::BITS - 2);
+    let mut u_u0 = r_u0;
+
+    for i in 0..count {
+        // First iteration: `s = m*r` (`u_u0 = r_u0` set above)
+        // Subsequent iterations: `s=s*u/2`
+        s_u2 = wmulh(s_u2, u_u0);
+
+        // Perform `s /= 2` if:
+        //
+        // 1. This is not the first iteration (the first iteration is `s = m*r`)...
+        // 2. ... and this is not the last set of iterations
+        // 3. ... or, if this is the last set, it is not the last iteration
+        //
+        // This step is not performed for the final iteration because the shift is combined with
+        // a later shift (moving `s` into the mantissa).
+        if i > 0 && (!final_set || i + 1 < count) {
+            s_u2 <<= 1;
+        }
+
+        // u = 3 - s*r
+        let d_u2 = wmulh(s_u2, r_u0);
+        u_u0 = three_u2.wrapping_sub(d_u2);
+
+        // r = r*u/2
+        r_u0 = wmulh(r_u0, u_u0) << 1;
+    }
+
+    (r_u0, s_u2)
+}
+
+/// Representation of whether we shift the exponent into a `u32`, or modify it in place to save
+/// the shift operations.
+enum Exp<T> {
+    /// The exponent has been shifted to a `u32` and is LSB-aligned.
+    Shifted(u32),
+    /// The exponent is in its natural position in integer repr.
+    NoShift(T),
+}
+
+/// Size-specific constants related to the square root routine.
+pub trait SqrtHelper: Float {
+    /// Integer for the first set of rounds. If unused, set to the same type as the next set.
+    type ISet1: HInt + Into<Self::ISet2> + CastFrom<Self::Int> + From<u8>;
+    /// Integer for the second set of rounds. If unused, set to the same type as the next set.
+    type ISet2: HInt + From<Self::ISet1> + From<u8>;
+
+    /// Number of rounds at `ISet1`.
+    const SET1_ROUNDS: u32 = 0;
+    /// Number of rounds at `ISet2`.
+    const SET2_ROUNDS: u32 = 0;
+    /// Number of rounds at `Self::Int`.
+    const FINAL_ROUNDS: u32;
+}
+
+impl SqrtHelper for f32 {
+    type ISet1 = u32; // unused
+    type ISet2 = u32; // unused
+
+    const FINAL_ROUNDS: u32 = 3;
+}
+
+impl SqrtHelper for f64 {
+    type ISet1 = u32; // unused
+    type ISet2 = u32;
+
+    const SET2_ROUNDS: u32 = 2;
+    const FINAL_ROUNDS: u32 = 2;
+}
+
+/// A U0.16 representation of `1/sqrt(x)`.
+///
+// / The index is a 7-bit number consisting of a single exponent bit and 6 bits of significand.
+#[rustfmt::skip]
+static RSQRT_TAB: [u16; 128] = [
+    0xb451, 0xb2f0, 0xb196, 0xb044, 0xaef9, 0xadb6, 0xac79, 0xab43,
+    0xaa14, 0xa8eb, 0xa7c8, 0xa6aa, 0xa592, 0xa480, 0xa373, 0xa26b,
+    0xa168, 0xa06a, 0x9f70, 0x9e7b, 0x9d8a, 0x9c9d, 0x9bb5, 0x9ad1,
+    0x99f0, 0x9913, 0x983a, 0x9765, 0x9693, 0x95c4, 0x94f8, 0x9430,
+    0x936b, 0x92a9, 0x91ea, 0x912e, 0x9075, 0x8fbe, 0x8f0a, 0x8e59,
+    0x8daa, 0x8cfe, 0x8c54, 0x8bac, 0x8b07, 0x8a64, 0x89c4, 0x8925,
+    0x8889, 0x87ee, 0x8756, 0x86c0, 0x862b, 0x8599, 0x8508, 0x8479,
+    0x83ec, 0x8361, 0x82d8, 0x8250, 0x81c9, 0x8145, 0x80c2, 0x8040,
+    0xff02, 0xfd0e, 0xfb25, 0xf947, 0xf773, 0xf5aa, 0xf3ea, 0xf234,
+    0xf087, 0xeee3, 0xed47, 0xebb3, 0xea27, 0xe8a3, 0xe727, 0xe5b2,
+    0xe443, 0xe2dc, 0xe17a, 0xe020, 0xdecb, 0xdd7d, 0xdc34, 0xdaf1,
+    0xd9b3, 0xd87b, 0xd748, 0xd61a, 0xd4f1, 0xd3cd, 0xd2ad, 0xd192,
+    0xd07b, 0xcf69, 0xce5b, 0xcd51, 0xcc4a, 0xcb48, 0xca4a, 0xc94f,
+    0xc858, 0xc764, 0xc674, 0xc587, 0xc49d, 0xc3b7, 0xc2d4, 0xc1f4,
+    0xc116, 0xc03c, 0xbf65, 0xbe90, 0xbdbe, 0xbcef, 0xbc23, 0xbb59,
+    0xba91, 0xb9cc, 0xb90a, 0xb84a, 0xb78c, 0xb6d0, 0xb617, 0xb560,
+];
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    /// Test against edge cases from https://en.cppreference.com/w/cpp/numeric/math/sqrt
+    fn spec_test<F>()
+    where
+        F: Float + SqrtHelper,
+        F::Int: HInt,
+        F::Int: From<u8>,
+        F::Int: From<F::ISet2>,
+        F::Int: CastInto<F::ISet1>,
+        F::Int: CastInto<F::ISet2>,
+        u32: CastInto<F::Int>,
+    {
+        // Not Asserted: FE_INVALID exception is raised if argument is negative.
+        assert!(sqrt(F::NEG_ONE).is_nan());
+        assert!(sqrt(F::NAN).is_nan());
+        for f in [F::ZERO, F::NEG_ZERO, F::INFINITY].iter().copied() {
+            assert_biteq!(sqrt(f), f);
+        }
+    }
+
+    #[test]
+    fn sanity_check_f32() {
+        assert_biteq!(sqrt(100.0f32), 10.0);
+        assert_biteq!(sqrt(4.0f32), 2.0);
+    }
+
+    #[test]
+    fn spec_tests_f32() {
+        spec_test::<f32>();
+    }
+
+    #[test]
+    #[allow(clippy::approx_constant)]
+    fn conformance_tests_f32() {
+        let cases = [
+            (f32::PI, 0x3fe2dfc5_u32),
+            (10000.0f32, 0x42c80000_u32),
+            (f32::from_bits(0x0000000f), 0x1b2f456f_u32),
+            (f32::INFINITY, f32::INFINITY.to_bits()),
+        ];
+
+        for (input, output) in cases {
+            assert_biteq!(
+                sqrt(input),
+                f32::from_bits(output),
+                "input: {input:?} ({:#018x})",
+                input.to_bits()
+            );
+        }
+    }
+
+    #[test]
+    fn sanity_check_f64() {
+        assert_biteq!(sqrt(100.0f64), 10.0);
+        assert_biteq!(sqrt(4.0f64), 2.0);
+    }
+
+    #[test]
+    fn spec_tests_f64() {
+        spec_test::<f64>();
+    }
+
+    #[test]
+    #[allow(clippy::approx_constant)]
+    fn conformance_tests_f64() {
+        let cases = [
+            (f64::PI, 0x3ffc5bf891b4ef6a_u64),
+            (10000.0, 0x4059000000000000_u64),
+            (f64::from_bits(0x0000000f), 0x1e7efbdeb14f4eda_u64),
+            (f64::INFINITY, f64::INFINITY.to_bits()),
+        ];
+
+        for (input, output) in cases {
+            assert_biteq!(
+                sqrt(input),
+                f64::from_bits(output),
+                "input: {input:?} ({:#018x})",
+                input.to_bits()
+            );
+        }
+    }
+}
diff --git a/library/compiler-builtins/libm/src/math/sqrt.rs b/library/compiler-builtins/libm/src/math/sqrt.rs
index 2fd7070b1101..0e1d0cd2c1c9 100644
--- a/library/compiler-builtins/libm/src/math/sqrt.rs
+++ b/library/compiler-builtins/libm/src/math/sqrt.rs
@@ -1,83 +1,3 @@
-/* origin: FreeBSD /usr/src/lib/msun/src/e_sqrt.c */
-/*
- * ====================================================
- * Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
- *
- * Developed at SunSoft, a Sun Microsystems, Inc. business.
- * Permission to use, copy, modify, and distribute this
- * software is freely granted, provided that this notice
- * is preserved.
- * ====================================================
- */
-/* sqrt(x)
- * Return correctly rounded sqrt.
- *           ------------------------------------------
- *           |  Use the hardware sqrt if you have one |
- *           ------------------------------------------
- * Method:
- *   Bit by bit method using integer arithmetic. (Slow, but portable)
- *   1. Normalization
- *      Scale x to y in [1,4) with even powers of 2:
- *      find an integer k such that  1 <= (y=x*2^(2k)) < 4, then
- *              sqrt(x) = 2^k * sqrt(y)
- *   2. Bit by bit computation
- *      Let q  = sqrt(y) truncated to i bit after binary point (q = 1),
- *           i                                                   0
- *                                     i+1         2
- *          s  = 2*q , and      y  =  2   * ( y - q  ).         (1)
- *           i      i            i                 i
- *
- *      To compute q    from q , one checks whether
- *                  i+1       i
- *
- *                            -(i+1) 2
- *                      (q + 2      ) <= y.                     (2)
- *                        i
- *                                                            -(i+1)
- *      If (2) is false, then q   = q ; otherwise q   = q  + 2      .
- *                             i+1   i             i+1   i
- *
- *      With some algebraic manipulation, it is not difficult to see
- *      that (2) is equivalent to
- *                             -(i+1)
- *                      s  +  2       <= y                      (3)
- *                       i                i
- *
- *      The advantage of (3) is that s  and y  can be computed by
- *                                    i      i
- *      the following recurrence formula:
- *          if (3) is false
- *
- *          s     =  s  ,       y    = y   ;                    (4)
- *           i+1      i          i+1    i
- *
- *          otherwise,
- *                         -i                     -(i+1)
- *          s     =  s  + 2  ,  y    = y  -  s  - 2             (5)
- *           i+1      i          i+1    i     i
- *
- *      One may easily use induction to prove (4) and (5).
- *      Note. Since the left hand side of (3) contain only i+2 bits,
- *            it does not necessary to do a full (53-bit) comparison
- *            in (3).
- *   3. Final rounding
- *      After generating the 53 bits result, we compute one more bit.
- *      Together with the remainder, we can decide whether the
- *      result is exact, bigger than 1/2ulp, or less than 1/2ulp
- *      (it will never equal to 1/2ulp).
- *      The rounding mode can be detected by checking whether
- *      huge + tiny is equal to huge, and whether huge - tiny is
- *      equal to huge for some floating point number "huge" and "tiny".
- *
- * Special cases:
- *      sqrt(+-0) = +-0         ... exact
- *      sqrt(inf) = inf
- *      sqrt(-ve) = NaN         ... with invalid signal
- *      sqrt(NaN) = NaN         ... with invalid signal for signaling NaN
- */
-
-use core::f64;
-
 /// The square root of `x` (f64).
 #[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
 pub fn sqrt(x: f64) -> f64 {
@@ -90,175 +10,5 @@ pub fn sqrt(x: f64) -> f64 {
         args: x,
     }
 
-    use core::num::Wrapping;
-
-    const TINY: f64 = 1.0e-300;
-
-    let mut z: f64;
-    let sign: Wrapping<u32> = Wrapping(0x80000000);
-    let mut ix0: i32;
-    let mut s0: i32;
-    let mut q: i32;
-    let mut m: i32;
-    let mut t: i32;
-    let mut i: i32;
-    let mut r: Wrapping<u32>;
-    let mut t1: Wrapping<u32>;
-    let mut s1: Wrapping<u32>;
-    let mut ix1: Wrapping<u32>;
-    let mut q1: Wrapping<u32>;
-
-    ix0 = (x.to_bits() >> 32) as i32;
-    ix1 = Wrapping(x.to_bits() as u32);
-
-    /* take care of Inf and NaN */
-    if (ix0 & 0x7ff00000) == 0x7ff00000 {
-        return x * x + x; /* sqrt(NaN)=NaN, sqrt(+inf)=+inf, sqrt(-inf)=sNaN */
-    }
-    /* take care of zero */
-    if ix0 <= 0 {
-        if ((ix0 & !(sign.0 as i32)) | ix1.0 as i32) == 0 {
-            return x; /* sqrt(+-0) = +-0 */
-        }
-        if ix0 < 0 {
-            return (x - x) / (x - x); /* sqrt(-ve) = sNaN */
-        }
-    }
-    /* normalize x */
-    m = ix0 >> 20;
-    if m == 0 {
-        /* subnormal x */
-        while ix0 == 0 {
-            m -= 21;
-            ix0 |= (ix1 >> 11).0 as i32;
-            ix1 <<= 21;
-        }
-        i = 0;
-        while (ix0 & 0x00100000) == 0 {
-            i += 1;
-            ix0 <<= 1;
-        }
-        m -= i - 1;
-        ix0 |= (ix1 >> (32 - i) as usize).0 as i32;
-        ix1 = ix1 << i as usize;
-    }
-    m -= 1023; /* unbias exponent */
-    ix0 = (ix0 & 0x000fffff) | 0x00100000;
-    if (m & 1) == 1 {
-        /* odd m, double x to make it even */
-        ix0 *= 2;
-        ix0 += ((ix1 & sign) >> 31).0 as i32;
-        ix1 += ix1;
-    }
-    m >>= 1; /* m = [m/2] */
-
-    /* generate sqrt(x) bit by bit */
-    ix0 *= 2;
-    ix0 += ((ix1 & sign) >> 31).0 as i32;
-    ix1 += ix1;
-    q = 0; /* [q,q1] = sqrt(x) */
-    q1 = Wrapping(0);
-    s0 = 0;
-    s1 = Wrapping(0);
-    r = Wrapping(0x00200000); /* r = moving bit from right to left */
-
-    while r != Wrapping(0) {
-        t = s0 + r.0 as i32;
-        if t <= ix0 {
-            s0 = t + r.0 as i32;
-            ix0 -= t;
-            q += r.0 as i32;
-        }
-        ix0 *= 2;
-        ix0 += ((ix1 & sign) >> 31).0 as i32;
-        ix1 += ix1;
-        r >>= 1;
-    }
-
-    r = sign;
-    while r != Wrapping(0) {
-        t1 = s1 + r;
-        t = s0;
-        if t < ix0 || (t == ix0 && t1 <= ix1) {
-            s1 = t1 + r;
-            if (t1 & sign) == sign && (s1 & sign) == Wrapping(0) {
-                s0 += 1;
-            }
-            ix0 -= t;
-            if ix1 < t1 {
-                ix0 -= 1;
-            }
-            ix1 -= t1;
-            q1 += r;
-        }
-        ix0 *= 2;
-        ix0 += ((ix1 & sign) >> 31).0 as i32;
-        ix1 += ix1;
-        r >>= 1;
-    }
-
-    /* use floating add to find out rounding direction */
-    if (ix0 as u32 | ix1.0) != 0 {
-        z = 1.0 - TINY; /* raise inexact flag */
-        if z >= 1.0 {
-            z = 1.0 + TINY;
-            if q1.0 == 0xffffffff {
-                q1 = Wrapping(0);
-                q += 1;
-            } else if z > 1.0 {
-                if q1.0 == 0xfffffffe {
-                    q += 1;
-                }
-                q1 += Wrapping(2);
-            } else {
-                q1 += q1 & Wrapping(1);
-            }
-        }
-    }
-    ix0 = (q >> 1) + 0x3fe00000;
-    ix1 = q1 >> 1;
-    if (q & 1) == 1 {
-        ix1 |= sign;
-    }
-    ix0 += m << 20;
-    f64::from_bits(((ix0 as u64) << 32) | ix1.0 as u64)
-}
-
-#[cfg(test)]
-mod tests {
-    use super::*;
-
-    #[test]
-    fn sanity_check() {
-        assert_eq!(sqrt(100.0), 10.0);
-        assert_eq!(sqrt(4.0), 2.0);
-    }
-
-    /// The spec: https://en.cppreference.com/w/cpp/numeric/math/sqrt
-    #[test]
-    fn spec_tests() {
-        // Not Asserted: FE_INVALID exception is raised if argument is negative.
-        assert!(sqrt(-1.0).is_nan());
-        assert!(sqrt(f64::NAN).is_nan());
-        for f in [0.0, -0.0, f64::INFINITY].iter().copied() {
-            assert_eq!(sqrt(f), f);
-        }
-    }
-
-    #[test]
-    #[allow(clippy::approx_constant)]
-    fn conformance_tests() {
-        let values = [3.14159265359, 10000.0, f64::from_bits(0x0000000f), f64::INFINITY];
-        let results = [
-            4610661241675116657u64,
-            4636737291354636288u64,
-            2197470602079456986u64,
-            9218868437227405312u64,
-        ];
-
-        for i in 0..values.len() {
-            let bits = f64::to_bits(sqrt(values[i]));
-            assert_eq!(results[i], bits);
-        }
-    }
+    super::generic::sqrt(x)
 }
diff --git a/library/compiler-builtins/libm/src/math/sqrtf.rs b/library/compiler-builtins/libm/src/math/sqrtf.rs
index 319335163ae8..2e69a4b66942 100644
--- a/library/compiler-builtins/libm/src/math/sqrtf.rs
+++ b/library/compiler-builtins/libm/src/math/sqrtf.rs
@@ -1,18 +1,3 @@
-/* origin: FreeBSD /usr/src/lib/msun/src/e_sqrtf.c */
-/*
- * Conversion to float by Ian Lance Taylor, Cygnus Support, ian@cygnus.com.
- */
-/*
- * ====================================================
- * Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
- *
- * Developed at SunPro, a Sun Microsystems, Inc. business.
- * Permission to use, copy, modify, and distribute this
- * software is freely granted, provided that this notice
- * is preserved.
- * ====================================================
- */
-
 /// The square root of `x` (f32).
 #[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
 pub fn sqrtf(x: f32) -> f32 {
@@ -25,121 +10,5 @@ pub fn sqrtf(x: f32) -> f32 {
         args: x,
     }
 
-    const TINY: f32 = 1.0e-30;
-
-    let mut z: f32;
-    let sign: i32 = 0x80000000u32 as i32;
-    let mut ix: i32;
-    let mut s: i32;
-    let mut q: i32;
-    let mut m: i32;
-    let mut t: i32;
-    let mut i: i32;
-    let mut r: u32;
-
-    ix = x.to_bits() as i32;
-
-    /* take care of Inf and NaN */
-    if (ix as u32 & 0x7f800000) == 0x7f800000 {
-        return x * x + x; /* sqrt(NaN)=NaN, sqrt(+inf)=+inf, sqrt(-inf)=sNaN */
-    }
-
-    /* take care of zero */
-    if ix <= 0 {
-        if (ix & !sign) == 0 {
-            return x; /* sqrt(+-0) = +-0 */
-        }
-        if ix < 0 {
-            return (x - x) / (x - x); /* sqrt(-ve) = sNaN */
-        }
-    }
-
-    /* normalize x */
-    m = ix >> 23;
-    if m == 0 {
-        /* subnormal x */
-        i = 0;
-        while ix & 0x00800000 == 0 {
-            ix <<= 1;
-            i = i + 1;
-        }
-        m -= i - 1;
-    }
-    m -= 127; /* unbias exponent */
-    ix = (ix & 0x007fffff) | 0x00800000;
-    if m & 1 == 1 {
-        /* odd m, double x to make it even */
-        ix += ix;
-    }
-    m >>= 1; /* m = [m/2] */
-
-    /* generate sqrt(x) bit by bit */
-    ix += ix;
-    q = 0;
-    s = 0;
-    r = 0x01000000; /* r = moving bit from right to left */
-
-    while r != 0 {
-        t = s + r as i32;
-        if t <= ix {
-            s = t + r as i32;
-            ix -= t;
-            q += r as i32;
-        }
-        ix += ix;
-        r >>= 1;
-    }
-
-    /* use floating add to find out rounding direction */
-    if ix != 0 {
-        z = 1.0 - TINY; /* raise inexact flag */
-        if z >= 1.0 {
-            z = 1.0 + TINY;
-            if z > 1.0 {
-                q += 2;
-            } else {
-                q += q & 1;
-            }
-        }
-    }
-
-    ix = (q >> 1) + 0x3f000000;
-    ix += m << 23;
-    f32::from_bits(ix as u32)
-}
-
-// PowerPC tests are failing on LLVM 13: https://github.com/rust-lang/rust/issues/88520
-#[cfg(not(target_arch = "powerpc64"))]
-#[cfg(test)]
-mod tests {
-    use super::*;
-
-    #[test]
-    fn sanity_check() {
-        assert_eq!(sqrtf(100.0), 10.0);
-        assert_eq!(sqrtf(4.0), 2.0);
-    }
-
-    /// The spec: https://en.cppreference.com/w/cpp/numeric/math/sqrt
-    #[test]
-    fn spec_tests() {
-        // Not Asserted: FE_INVALID exception is raised if argument is negative.
-        assert!(sqrtf(-1.0).is_nan());
-        assert!(sqrtf(f32::NAN).is_nan());
-        for f in [0.0, -0.0, f32::INFINITY].iter().copied() {
-            assert_eq!(sqrtf(f), f);
-        }
-    }
-
-    #[test]
-    #[allow(clippy::approx_constant)]
-    fn conformance_tests() {
-        let values = [3.14159265359f32, 10000.0f32, f32::from_bits(0x0000000f), f32::INFINITY];
-        let results = [1071833029u32, 1120403456u32, 456082799u32, 2139095040u32];
-
-        for i in 0..values.len() {
-            let bits = f32::to_bits(sqrtf(values[i]));
-            assert_eq!(results[i], bits);
-        }
-    }
+    super::generic::sqrt(x)
 }
diff --git a/library/compiler-builtins/libm/src/math/support/int_traits.rs b/library/compiler-builtins/libm/src/math/support/int_traits.rs
index db799c030c28..cf19762e8169 100644
--- a/library/compiler-builtins/libm/src/math/support/int_traits.rs
+++ b/library/compiler-builtins/libm/src/math/support/int_traits.rs
@@ -55,10 +55,12 @@ pub trait Int:
     + ops::BitAnd<Output = Self>
     + cmp::Ord
     + CastFrom<i32>
+    + CastFrom<u16>
     + CastFrom<u32>
     + CastFrom<u8>
     + CastFrom<usize>
     + CastInto<i32>
+    + CastInto<u16>
     + CastInto<u32>
     + CastInto<u8>
     + CastInto<usize>
diff --git a/library/compiler-builtins/libm/src/math/support/macros.rs b/library/compiler-builtins/libm/src/math/support/macros.rs
index 076fdf1f775e..c9a36c0db1a7 100644
--- a/library/compiler-builtins/libm/src/math/support/macros.rs
+++ b/library/compiler-builtins/libm/src/math/support/macros.rs
@@ -110,19 +110,21 @@ macro_rules! hf64 {
 /// Assert `F::biteq` with better messages.
 #[cfg(test)]
 macro_rules! assert_biteq {
-    ($left:expr, $right:expr, $($arg:tt)*) => {{
-        let bits = ($left.to_bits() * 0).leading_zeros(); // hack to get the width from the value
+    ($left:expr, $right:expr, $($tt:tt)*) => {{
+        let l = $left;
+        let r = $right;
+        let bits = (l.to_bits() - l.to_bits()).leading_zeros(); // hack to get the width from the value
         assert!(
-            $left.biteq($right),
-            "\nl: {l:?} ({lb:#0width$x})\nr: {r:?} ({rb:#0width$x})",
-            l = $left,
-            lb = $left.to_bits(),
-            r = $right,
-            rb = $right.to_bits(),
-            width = ((bits / 4) + 2) as usize
+            l.biteq(r),
+            "{}\nl: {l:?} ({lb:#0width$x})\nr: {r:?} ({rb:#0width$x})",
+            format_args!($($tt)*),
+            lb = l.to_bits(),
+            rb = r.to_bits(),
+            width = ((bits / 4) + 2) as usize,
+
         );
     }};
     ($left:expr, $right:expr $(,)?) => {
-        assert_biteq!($left, $right,)
+        assert_biteq!($left, $right, "")
     };
 }
diff --git a/library/compiler-builtins/libm/src/math/support/mod.rs b/library/compiler-builtins/libm/src/math/support/mod.rs
index e2f4e0e981d6..b4a57a34ed42 100644
--- a/library/compiler-builtins/libm/src/math/support/mod.rs
+++ b/library/compiler-builtins/libm/src/math/support/mod.rs
@@ -10,3 +10,14 @@ pub(crate) use float_traits::{f32_from_bits, f64_from_bits};
 #[allow(unused_imports)]
 pub use hex_float::{hf32, hf64};
 pub use int_traits::{CastFrom, CastInto, DInt, HInt, Int, MinInt};
+
+/// Hint to the compiler that the current path is cold.
+pub fn cold_path() {
+    #[cfg(intrinsics_enabled)]
+    core::intrinsics::cold_path();
+}
+
+/// Return `x`, first raising `FE_INVALID`.
+pub fn raise_invalid<F: Float>(x: F) -> F {
+    (x - x) / (x - x)
+}

From 03041a0371565f1a32583d770e15f4ffb60232b5 Mon Sep 17 00:00:00 2001
From: Trevor Gross <tmgross@umich.edu>
Date: Wed, 15 Jan 2025 11:49:28 +0000
Subject: [PATCH 4/5] Copy the u256 implementation from compiler_builtins

---
 .../libm/src/math/support/big.rs              | 302 ++++++++++++++++++
 .../libm/src/math/support/big/tests.rs        | 110 +++++++
 .../libm/src/math/support/mod.rs              |   1 +
 3 files changed, 413 insertions(+)
 create mode 100644 library/compiler-builtins/libm/src/math/support/big.rs
 create mode 100644 library/compiler-builtins/libm/src/math/support/big/tests.rs

diff --git a/library/compiler-builtins/libm/src/math/support/big.rs b/library/compiler-builtins/libm/src/math/support/big.rs
new file mode 100644
index 000000000000..e0f5e52634ef
--- /dev/null
+++ b/library/compiler-builtins/libm/src/math/support/big.rs
@@ -0,0 +1,302 @@
+//! Integers used for wide operations, larger than `u128`.
+
+#![allow(unused)]
+
+#[cfg(test)]
+mod tests;
+
+use core::{fmt, ops};
+
+use super::{DInt, HInt, Int, MinInt};
+
+const WORD_LO_MASK: u64 = 0x00000000ffffffff;
+const WORD_HI_MASK: u64 = 0xffffffff00000000;
+const WORD_FULL_MASK: u64 = 0xffffffffffffffff;
+const U128_LO_MASK: u128 = u64::MAX as u128;
+const U128_HI_MASK: u128 = (u64::MAX as u128) << 64;
+
+/// A 256-bit unsigned integer represented as 4 64-bit limbs.
+///
+/// Each limb is a native-endian number, but the array is little-limb-endian.
+#[allow(non_camel_case_types)]
+#[derive(Clone, Copy, Debug, PartialEq, PartialOrd)]
+pub struct u256(pub [u64; 4]);
+
+impl u256 {
+    pub const MAX: Self = Self([u64::MAX, u64::MAX, u64::MAX, u64::MAX]);
+
+    /// Reinterpret as a signed integer
+    pub fn signed(self) -> i256 {
+        i256(self.0)
+    }
+}
+
+/// A 256-bit signed integer represented as 4 64-bit limbs.
+///
+/// Each limb is a native-endian number, but the array is little-limb-endian.
+#[allow(non_camel_case_types)]
+#[derive(Clone, Copy, Debug, PartialEq, PartialOrd)]
+pub struct i256(pub [u64; 4]);
+
+impl i256 {
+    /// Reinterpret as an unsigned integer
+    pub fn unsigned(self) -> u256 {
+        u256(self.0)
+    }
+}
+
+impl MinInt for u256 {
+    type OtherSign = i256;
+
+    type Unsigned = u256;
+
+    const SIGNED: bool = false;
+    const BITS: u32 = 256;
+    const ZERO: Self = Self([0u64; 4]);
+    const ONE: Self = Self([1, 0, 0, 0]);
+    const MIN: Self = Self([0u64; 4]);
+    const MAX: Self = Self([u64::MAX; 4]);
+}
+
+impl MinInt for i256 {
+    type OtherSign = u256;
+
+    type Unsigned = u256;
+
+    const SIGNED: bool = false;
+    const BITS: u32 = 256;
+    const ZERO: Self = Self([0u64; 4]);
+    const ONE: Self = Self([1, 0, 0, 0]);
+    const MIN: Self = Self([0, 0, 0, 1 << 63]);
+    const MAX: Self = Self([u64::MAX, u64::MAX, u64::MAX, u64::MAX << 1]);
+}
+
+macro_rules! impl_common {
+    ($ty:ty) => {
+        impl ops::BitOr for $ty {
+            type Output = Self;
+
+            fn bitor(mut self, rhs: Self) -> Self::Output {
+                self.0[0] |= rhs.0[0];
+                self.0[1] |= rhs.0[1];
+                self.0[2] |= rhs.0[2];
+                self.0[3] |= rhs.0[3];
+                self
+            }
+        }
+
+        impl ops::Not for $ty {
+            type Output = Self;
+
+            fn not(self) -> Self::Output {
+                Self([!self.0[0], !self.0[1], !self.0[2], !self.0[3]])
+            }
+        }
+
+        impl ops::Shl<u32> for $ty {
+            type Output = Self;
+
+            fn shl(self, rhs: u32) -> Self::Output {
+                unimplemented!("only used to meet trait bounds")
+            }
+        }
+    };
+}
+
+impl_common!(i256);
+impl_common!(u256);
+
+impl ops::Shr<u32> for u256 {
+    type Output = Self;
+
+    fn shr(self, rhs: u32) -> Self::Output {
+        assert!(rhs < Self::BITS, "attempted to shift right with overflow");
+
+        if rhs == 0 {
+            return self;
+        }
+
+        let mut ret = self;
+        let byte_shift = rhs / 64;
+        let bit_shift = rhs % 64;
+
+        for idx in 0..4 {
+            let base_idx = idx + byte_shift as usize;
+
+            // FIXME(msrv): could be let...else.
+            let base = match ret.0.get(base_idx) {
+                Some(v) => v,
+                None => {
+                    ret.0[idx] = 0;
+                    continue;
+                }
+            };
+
+            let mut new_val = base >> bit_shift;
+
+            if let Some(new) = ret.0.get(base_idx + 1) {
+                new_val |= new.overflowing_shl(64 - bit_shift).0;
+            }
+
+            ret.0[idx] = new_val;
+        }
+
+        ret
+    }
+}
+
+macro_rules! word {
+    (1, $val:expr) => {
+        (($val >> (32 * 3)) & Self::from(WORD_LO_MASK)) as u64
+    };
+    (2, $val:expr) => {
+        (($val >> (32 * 2)) & Self::from(WORD_LO_MASK)) as u64
+    };
+    (3, $val:expr) => {
+        (($val >> (32 * 1)) & Self::from(WORD_LO_MASK)) as u64
+    };
+    (4, $val:expr) => {
+        (($val >> (32 * 0)) & Self::from(WORD_LO_MASK)) as u64
+    };
+}
+
+impl HInt for u128 {
+    type D = u256;
+
+    fn widen(self) -> Self::D {
+        let w0 = self & u128::from(u64::MAX);
+        let w1 = (self >> u64::BITS) & u128::from(u64::MAX);
+        u256([w0 as u64, w1 as u64, 0, 0])
+    }
+
+    fn zero_widen(self) -> Self::D {
+        self.widen()
+    }
+
+    fn zero_widen_mul(self, rhs: Self) -> Self::D {
+        let product11: u64 = word!(1, self) * word!(1, rhs);
+        let product12: u64 = word!(1, self) * word!(2, rhs);
+        let product13: u64 = word!(1, self) * word!(3, rhs);
+        let product14: u64 = word!(1, self) * word!(4, rhs);
+        let product21: u64 = word!(2, self) * word!(1, rhs);
+        let product22: u64 = word!(2, self) * word!(2, rhs);
+        let product23: u64 = word!(2, self) * word!(3, rhs);
+        let product24: u64 = word!(2, self) * word!(4, rhs);
+        let product31: u64 = word!(3, self) * word!(1, rhs);
+        let product32: u64 = word!(3, self) * word!(2, rhs);
+        let product33: u64 = word!(3, self) * word!(3, rhs);
+        let product34: u64 = word!(3, self) * word!(4, rhs);
+        let product41: u64 = word!(4, self) * word!(1, rhs);
+        let product42: u64 = word!(4, self) * word!(2, rhs);
+        let product43: u64 = word!(4, self) * word!(3, rhs);
+        let product44: u64 = word!(4, self) * word!(4, rhs);
+
+        let sum0: u128 = u128::from(product44);
+        let sum1: u128 = u128::from(product34) + u128::from(product43);
+        let sum2: u128 = u128::from(product24) + u128::from(product33) + u128::from(product42);
+        let sum3: u128 = u128::from(product14)
+            + u128::from(product23)
+            + u128::from(product32)
+            + u128::from(product41);
+        let sum4: u128 = u128::from(product13) + u128::from(product22) + u128::from(product31);
+        let sum5: u128 = u128::from(product12) + u128::from(product21);
+        let sum6: u128 = u128::from(product11);
+
+        let r0: u128 =
+            (sum0 & u128::from(WORD_FULL_MASK)) + ((sum1 & u128::from(WORD_LO_MASK)) << 32);
+        let r1: u128 = (sum0 >> 64)
+            + ((sum1 >> 32) & u128::from(WORD_FULL_MASK))
+            + (sum2 & u128::from(WORD_FULL_MASK))
+            + ((sum3 << 32) & u128::from(WORD_HI_MASK));
+
+        let (lo, carry) = r0.overflowing_add(r1 << 64);
+        let hi = (r1 >> 64)
+            + (sum1 >> 96)
+            + (sum2 >> 64)
+            + (sum3 >> 32)
+            + sum4
+            + (sum5 << 32)
+            + (sum6 << 64)
+            + u128::from(carry);
+
+        u256([
+            (lo & U128_LO_MASK) as u64,
+            ((lo >> 64) & U128_LO_MASK) as u64,
+            (hi & U128_LO_MASK) as u64,
+            ((hi >> 64) & U128_LO_MASK) as u64,
+        ])
+    }
+
+    fn widen_mul(self, rhs: Self) -> Self::D {
+        self.zero_widen_mul(rhs)
+    }
+
+    fn widen_hi(self) -> Self::D {
+        self.widen() << <Self as MinInt>::BITS
+    }
+}
+
+impl HInt for i128 {
+    type D = i256;
+
+    fn widen(self) -> Self::D {
+        let mut ret = self.unsigned().zero_widen().signed();
+        if self.is_negative() {
+            ret.0[2] = u64::MAX;
+            ret.0[3] = u64::MAX;
+        }
+        ret
+    }
+
+    fn zero_widen(self) -> Self::D {
+        self.unsigned().zero_widen().signed()
+    }
+
+    fn zero_widen_mul(self, rhs: Self) -> Self::D {
+        self.unsigned().zero_widen_mul(rhs.unsigned()).signed()
+    }
+
+    fn widen_mul(self, rhs: Self) -> Self::D {
+        unimplemented!("signed i128 widening multiply is not used")
+    }
+
+    fn widen_hi(self) -> Self::D {
+        self.widen() << <Self as MinInt>::BITS
+    }
+}
+
+impl DInt for u256 {
+    type H = u128;
+
+    fn lo(self) -> Self::H {
+        let mut tmp = [0u8; 16];
+        tmp[..8].copy_from_slice(&self.0[0].to_le_bytes());
+        tmp[8..].copy_from_slice(&self.0[1].to_le_bytes());
+        u128::from_le_bytes(tmp)
+    }
+
+    fn hi(self) -> Self::H {
+        let mut tmp = [0u8; 16];
+        tmp[..8].copy_from_slice(&self.0[2].to_le_bytes());
+        tmp[8..].copy_from_slice(&self.0[3].to_le_bytes());
+        u128::from_le_bytes(tmp)
+    }
+}
+
+impl DInt for i256 {
+    type H = i128;
+
+    fn lo(self) -> Self::H {
+        let mut tmp = [0u8; 16];
+        tmp[..8].copy_from_slice(&self.0[0].to_le_bytes());
+        tmp[8..].copy_from_slice(&self.0[1].to_le_bytes());
+        i128::from_le_bytes(tmp)
+    }
+
+    fn hi(self) -> Self::H {
+        let mut tmp = [0u8; 16];
+        tmp[..8].copy_from_slice(&self.0[2].to_le_bytes());
+        tmp[8..].copy_from_slice(&self.0[3].to_le_bytes());
+        i128::from_le_bytes(tmp)
+    }
+}
diff --git a/library/compiler-builtins/libm/src/math/support/big/tests.rs b/library/compiler-builtins/libm/src/math/support/big/tests.rs
new file mode 100644
index 000000000000..f95f82973161
--- /dev/null
+++ b/library/compiler-builtins/libm/src/math/support/big/tests.rs
@@ -0,0 +1,110 @@
+extern crate std;
+use std::string::String;
+use std::vec::Vec;
+use std::{eprintln, format};
+
+use super::{HInt, MinInt, i256, u256};
+
+const LOHI_SPLIT: u128 = 0xaaaaaaaaaaaaaaaaffffffffffffffff;
+
+/// Print a `u256` as hex since we can't add format implementations
+fn hexu(v: u256) -> String {
+    format!("0x{:016x}{:016x}{:016x}{:016x}", v.0[3], v.0[2], v.0[1], v.0[0])
+}
+
+#[test]
+fn widen_u128() {
+    assert_eq!(u128::MAX.widen(), u256([u64::MAX, u64::MAX, 0, 0]));
+    assert_eq!(LOHI_SPLIT.widen(), u256([u64::MAX, 0xaaaaaaaaaaaaaaaa, 0, 0]));
+}
+
+#[test]
+fn widen_i128() {
+    assert_eq!((-1i128).widen(), u256::MAX.signed());
+    assert_eq!(
+        (LOHI_SPLIT as i128).widen(),
+        i256([u64::MAX, 0xaaaaaaaaaaaaaaaa, u64::MAX, u64::MAX])
+    );
+    assert_eq!((-1i128).zero_widen().unsigned(), (u128::MAX).widen());
+}
+
+#[test]
+fn widen_mul_u128() {
+    let tests = [
+        (u128::MAX / 2, 2_u128, u256([u64::MAX - 1, u64::MAX, 0, 0])),
+        (u128::MAX, 2_u128, u256([u64::MAX - 1, u64::MAX, 1, 0])),
+        (u128::MAX, u128::MAX, u256([1, 0, u64::MAX - 1, u64::MAX])),
+        (u128::MIN, u128::MIN, u256::ZERO),
+        (1234, 0, u256::ZERO),
+        (0, 1234, u256::ZERO),
+    ];
+
+    let mut errors = Vec::new();
+    for (i, (a, b, exp)) in tests.iter().copied().enumerate() {
+        let res = a.widen_mul(b);
+        let res_z = a.zero_widen_mul(b);
+        assert_eq!(res, res_z);
+        if res != exp {
+            errors.push((i, a, b, exp, res));
+        }
+    }
+
+    for (i, a, b, exp, res) in &errors {
+        eprintln!("FAILURE ({i}): {a:#034x} * {b:#034x} = {} got {}", hexu(*exp), hexu(*res));
+    }
+    assert!(errors.is_empty());
+}
+
+#[test]
+fn not_u128() {
+    assert_eq!(!u256::ZERO, u256::MAX);
+}
+
+#[test]
+fn shr_u128() {
+    let only_low = [1, u16::MAX.into(), u32::MAX.into(), u64::MAX.into(), u128::MAX];
+
+    let mut errors = Vec::new();
+
+    for a in only_low {
+        for perturb in 0..10 {
+            let a = a.saturating_add(perturb);
+            for shift in 0..128 {
+                let res = a.widen() >> shift;
+                let expected = (a >> shift).widen();
+                if res != expected {
+                    errors.push((a.widen(), shift, res, expected));
+                }
+            }
+        }
+    }
+
+    let check = [
+        (u256::MAX, 1, u256([u64::MAX, u64::MAX, u64::MAX, u64::MAX >> 1])),
+        (u256::MAX, 5, u256([u64::MAX, u64::MAX, u64::MAX, u64::MAX >> 5])),
+        (u256::MAX, 63, u256([u64::MAX, u64::MAX, u64::MAX, 1])),
+        (u256::MAX, 64, u256([u64::MAX, u64::MAX, u64::MAX, 0])),
+        (u256::MAX, 65, u256([u64::MAX, u64::MAX, u64::MAX >> 1, 0])),
+        (u256::MAX, 127, u256([u64::MAX, u64::MAX, 1, 0])),
+        (u256::MAX, 128, u256([u64::MAX, u64::MAX, 0, 0])),
+        (u256::MAX, 129, u256([u64::MAX, u64::MAX >> 1, 0, 0])),
+        (u256::MAX, 191, u256([u64::MAX, 1, 0, 0])),
+        (u256::MAX, 192, u256([u64::MAX, 0, 0, 0])),
+        (u256::MAX, 193, u256([u64::MAX >> 1, 0, 0, 0])),
+        (u256::MAX, 191, u256([u64::MAX, 1, 0, 0])),
+        (u256::MAX, 254, u256([0b11, 0, 0, 0])),
+        (u256::MAX, 255, u256([1, 0, 0, 0])),
+    ];
+
+    for (input, shift, expected) in check {
+        let res = input >> shift;
+        if res != expected {
+            errors.push((input, shift, res, expected));
+        }
+    }
+
+    for (a, b, res, expected) in &errors {
+        eprintln!("FAILURE: {} >> {b} = {} got {}", hexu(*a), hexu(*expected), hexu(*res),);
+    }
+    assert!(errors.is_empty());
+}
diff --git a/library/compiler-builtins/libm/src/math/support/mod.rs b/library/compiler-builtins/libm/src/math/support/mod.rs
index b4a57a34ed42..ddfc2e3e0717 100644
--- a/library/compiler-builtins/libm/src/math/support/mod.rs
+++ b/library/compiler-builtins/libm/src/math/support/mod.rs
@@ -1,5 +1,6 @@
 #[macro_use]
 pub mod macros;
+mod big;
 mod float_traits;
 mod hex_float;
 mod int_traits;

From 186eac9227e83b9ba0d771ac9326632d42e25086 Mon Sep 17 00:00:00 2001
From: Trevor Gross <tmgross@umich.edu>
Date: Tue, 21 Jan 2025 23:59:07 +0000
Subject: [PATCH 5/5] Add `sqrtf16` and `sqrtf128`

Use the generic algorithms to provide implementations for these
routines.
---
 .../libm/crates/libm-macros/src/shared.rs     |  4 +-
 .../libm/crates/libm-test/benches/icount.rs   |  2 +
 .../libm/crates/libm-test/benches/random.rs   |  2 +
 .../libm-test/tests/compare_built_musl.rs     |  2 +
 .../libm/crates/util/src/main.rs              |  2 +
 .../libm/etc/function-definitions.json        | 14 +++
 .../libm/etc/function-list.txt                |  2 +
 .../libm/src/math/generic/sqrt.rs             | 92 +++++++++++++++++++
 .../compiler-builtins/libm/src/math/mod.rs    |  4 +
 .../libm/src/math/sqrtf128.rs                 |  5 +
 .../libm/src/math/sqrtf16.rs                  |  5 +
 11 files changed, 132 insertions(+), 2 deletions(-)
 create mode 100644 library/compiler-builtins/libm/src/math/sqrtf128.rs
 create mode 100644 library/compiler-builtins/libm/src/math/sqrtf16.rs

diff --git a/library/compiler-builtins/libm/crates/libm-macros/src/shared.rs b/library/compiler-builtins/libm/crates/libm-macros/src/shared.rs
index 60838196236f..d17bc6ffccb7 100644
--- a/library/compiler-builtins/libm/crates/libm-macros/src/shared.rs
+++ b/library/compiler-builtins/libm/crates/libm-macros/src/shared.rs
@@ -9,7 +9,7 @@ const ALL_OPERATIONS_NESTED: &[(FloatTy, Signature, Option<Signature>, &[&str])]
         FloatTy::F16,
         Signature { args: &[Ty::F16], returns: &[Ty::F16] },
         None,
-        &["fabsf16", "truncf16"],
+        &["fabsf16", "sqrtf16", "truncf16"],
     ),
     (
         // `fn(f32) -> f32`
@@ -40,7 +40,7 @@ const ALL_OPERATIONS_NESTED: &[(FloatTy, Signature, Option<Signature>, &[&str])]
         FloatTy::F128,
         Signature { args: &[Ty::F128], returns: &[Ty::F128] },
         None,
-        &["fabsf128", "truncf128"],
+        &["fabsf128", "sqrtf128", "truncf128"],
     ),
     (
         // `(f16, f16) -> f16`
diff --git a/library/compiler-builtins/libm/crates/libm-test/benches/icount.rs b/library/compiler-builtins/libm/crates/libm-test/benches/icount.rs
index 3a66249e85dc..c8451f88cd38 100644
--- a/library/compiler-builtins/libm/crates/libm-test/benches/icount.rs
+++ b/library/compiler-builtins/libm/crates/libm-test/benches/icount.rs
@@ -155,6 +155,8 @@ main!(
     icount_bench_sinh_group,
     icount_bench_sinhf_group,
     icount_bench_sqrt_group,
+    icount_bench_sqrtf128_group,
+    icount_bench_sqrtf16_group,
     icount_bench_sqrtf_group,
     icount_bench_tan_group,
     icount_bench_tanf_group,
diff --git a/library/compiler-builtins/libm/crates/libm-test/benches/random.rs b/library/compiler-builtins/libm/crates/libm-test/benches/random.rs
index 888161265100..026841202532 100644
--- a/library/compiler-builtins/libm/crates/libm-test/benches/random.rs
+++ b/library/compiler-builtins/libm/crates/libm-test/benches/random.rs
@@ -123,6 +123,8 @@ libm_macros::for_each_function! {
         | fabsf16
         | fdimf128
         | fdimf16
+        | sqrtf16
+        | sqrtf128
         | truncf128
         | truncf16 => (false, None),
 
diff --git a/library/compiler-builtins/libm/crates/libm-test/tests/compare_built_musl.rs b/library/compiler-builtins/libm/crates/libm-test/tests/compare_built_musl.rs
index f540a0b15517..24703f273168 100644
--- a/library/compiler-builtins/libm/crates/libm-test/tests/compare_built_musl.rs
+++ b/library/compiler-builtins/libm/crates/libm-test/tests/compare_built_musl.rs
@@ -87,5 +87,7 @@ libm_macros::for_each_function! {
         fdimf16,
         truncf128,
         truncf16,
+        sqrtf16,
+        sqrtf128,
     ],
 }
diff --git a/library/compiler-builtins/libm/crates/util/src/main.rs b/library/compiler-builtins/libm/crates/util/src/main.rs
index b979c60ad3c0..cd68d9afd1a2 100644
--- a/library/compiler-builtins/libm/crates/util/src/main.rs
+++ b/library/compiler-builtins/libm/crates/util/src/main.rs
@@ -90,6 +90,8 @@ fn do_eval(basis: &str, op: &str, inputs: &[&str]) {
             | fabsf16
             | fdimf128
             | fdimf16
+            | sqrtf128
+            | sqrtf16
             | truncf128
             | truncf16  => None,
             _ => Some(musl_math_sys::MACRO_FN_NAME)
diff --git a/library/compiler-builtins/libm/etc/function-definitions.json b/library/compiler-builtins/libm/etc/function-definitions.json
index 9f7c8ab25a03..2d0af3bcff02 100644
--- a/library/compiler-builtins/libm/etc/function-definitions.json
+++ b/library/compiler-builtins/libm/etc/function-definitions.json
@@ -718,6 +718,20 @@
         ],
         "type": "f32"
     },
+    "sqrtf128": {
+        "sources": [
+            "src/math/generic/sqrt.rs",
+            "src/math/sqrtf128.rs"
+        ],
+        "type": "f128"
+    },
+    "sqrtf16": {
+        "sources": [
+            "src/math/generic/sqrt.rs",
+            "src/math/sqrtf16.rs"
+        ],
+        "type": "f16"
+    },
     "tan": {
         "sources": [
             "src/libm_helper.rs",
diff --git a/library/compiler-builtins/libm/etc/function-list.txt b/library/compiler-builtins/libm/etc/function-list.txt
index 7f96a436227e..47c34ab90521 100644
--- a/library/compiler-builtins/libm/etc/function-list.txt
+++ b/library/compiler-builtins/libm/etc/function-list.txt
@@ -105,6 +105,8 @@ sinh
 sinhf
 sqrt
 sqrtf
+sqrtf128
+sqrtf16
 tan
 tanf
 tanh
diff --git a/library/compiler-builtins/libm/src/math/generic/sqrt.rs b/library/compiler-builtins/libm/src/math/generic/sqrt.rs
index a2e054f3cf2b..c892f9997241 100644
--- a/library/compiler-builtins/libm/src/math/generic/sqrt.rs
+++ b/library/compiler-builtins/libm/src/math/generic/sqrt.rs
@@ -294,6 +294,14 @@ pub trait SqrtHelper: Float {
     const FINAL_ROUNDS: u32;
 }
 
+#[cfg(f16_enabled)]
+impl SqrtHelper for f16 {
+    type ISet1 = u16; // unused
+    type ISet2 = u16; // unused
+
+    const FINAL_ROUNDS: u32 = 2;
+}
+
 impl SqrtHelper for f32 {
     type ISet1 = u32; // unused
     type ISet2 = u32; // unused
@@ -309,6 +317,16 @@ impl SqrtHelper for f64 {
     const FINAL_ROUNDS: u32 = 2;
 }
 
+#[cfg(f128_enabled)]
+impl SqrtHelper for f128 {
+    type ISet1 = u32;
+    type ISet2 = u64;
+
+    const SET1_ROUNDS: u32 = 1;
+    const SET2_ROUNDS: u32 = 2;
+    const FINAL_ROUNDS: u32 = 2;
+}
+
 /// A U0.16 representation of `1/sqrt(x)`.
 ///
 // / The index is a 7-bit number consisting of a single exponent bit and 6 bits of significand.
@@ -355,6 +373,42 @@ mod tests {
         }
     }
 
+    #[test]
+    #[cfg(f16_enabled)]
+    fn sanity_check_f16() {
+        assert_biteq!(sqrt(100.0f16), 10.0);
+        assert_biteq!(sqrt(4.0f16), 2.0);
+    }
+
+    #[test]
+    #[cfg(f16_enabled)]
+    fn spec_tests_f16() {
+        spec_test::<f16>();
+    }
+
+    #[test]
+    #[cfg(f16_enabled)]
+    #[allow(clippy::approx_constant)]
+    fn conformance_tests_f16() {
+        let cases = [
+            (f16::PI, 0x3f17_u16),
+            // 10_000.0, using a hex literal for MSRV hack (Rust < 1.67 checks literal widths as
+            // part of the AST, so the `cfg` is irrelevant here).
+            (f16::from_bits(0x70e2), 0x5640_u16),
+            (f16::from_bits(0x0000000f), 0x13bf_u16),
+            (f16::INFINITY, f16::INFINITY.to_bits()),
+        ];
+
+        for (input, output) in cases {
+            assert_biteq!(
+                sqrt(input),
+                f16::from_bits(output),
+                "input: {input:?} ({:#018x})",
+                input.to_bits()
+            );
+        }
+    }
+
     #[test]
     fn sanity_check_f32() {
         assert_biteq!(sqrt(100.0f32), 10.0);
@@ -416,4 +470,42 @@ mod tests {
             );
         }
     }
+
+    #[test]
+    #[cfg(f128_enabled)]
+    fn sanity_check_f128() {
+        assert_biteq!(sqrt(100.0f128), 10.0);
+        assert_biteq!(sqrt(4.0f128), 2.0);
+    }
+
+    #[test]
+    #[cfg(f128_enabled)]
+    fn spec_tests_f128() {
+        spec_test::<f128>();
+    }
+
+    #[test]
+    #[cfg(f128_enabled)]
+    #[allow(clippy::approx_constant)]
+    fn conformance_tests_f128() {
+        let cases = [
+            (f128::PI, 0x3fffc5bf891b4ef6aa79c3b0520d5db9_u128),
+            // 10_000.0, see `f16` for reasoning.
+            (
+                f128::from_bits(0x400c3880000000000000000000000000),
+                0x40059000000000000000000000000000_u128,
+            ),
+            (f128::from_bits(0x0000000f), 0x1fc9efbdeb14f4ed9b17ae807907e1e9_u128),
+            (f128::INFINITY, f128::INFINITY.to_bits()),
+        ];
+
+        for (input, output) in cases {
+            assert_biteq!(
+                sqrt(input),
+                f128::from_bits(output),
+                "input: {input:?} ({:#018x})",
+                input.to_bits()
+            );
+        }
+    }
 }
diff --git a/library/compiler-builtins/libm/src/math/mod.rs b/library/compiler-builtins/libm/src/math/mod.rs
index 03adb6be1934..3684025a6c10 100644
--- a/library/compiler-builtins/libm/src/math/mod.rs
+++ b/library/compiler-builtins/libm/src/math/mod.rs
@@ -344,11 +344,13 @@ cfg_if! {
         mod copysignf16;
         mod fabsf16;
         mod fdimf16;
+        mod sqrtf16;
         mod truncf16;
 
         pub use self::copysignf16::copysignf16;
         pub use self::fabsf16::fabsf16;
         pub use self::fdimf16::fdimf16;
+        pub use self::sqrtf16::sqrtf16;
         pub use self::truncf16::truncf16;
     }
 }
@@ -358,11 +360,13 @@ cfg_if! {
         mod copysignf128;
         mod fabsf128;
         mod fdimf128;
+        mod sqrtf128;
         mod truncf128;
 
         pub use self::copysignf128::copysignf128;
         pub use self::fabsf128::fabsf128;
         pub use self::fdimf128::fdimf128;
+        pub use self::sqrtf128::sqrtf128;
         pub use self::truncf128::truncf128;
     }
 }
diff --git a/library/compiler-builtins/libm/src/math/sqrtf128.rs b/library/compiler-builtins/libm/src/math/sqrtf128.rs
new file mode 100644
index 000000000000..eaef6ae0c1c8
--- /dev/null
+++ b/library/compiler-builtins/libm/src/math/sqrtf128.rs
@@ -0,0 +1,5 @@
+/// The square root of `x` (f128).
+#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+pub fn sqrtf128(x: f128) -> f128 {
+    return super::generic::sqrt(x);
+}
diff --git a/library/compiler-builtins/libm/src/math/sqrtf16.rs b/library/compiler-builtins/libm/src/math/sqrtf16.rs
new file mode 100644
index 000000000000..549bf902c722
--- /dev/null
+++ b/library/compiler-builtins/libm/src/math/sqrtf16.rs
@@ -0,0 +1,5 @@
+/// The square root of `x` (f16).
+#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+pub fn sqrtf16(x: f16) -> f16 {
+    return super::generic::sqrt(x);
+}