Add scalbnf16, scalbnf128, ldexpf16, and ldexpf128

Use the generic `scalbn` to provide `f16` and `f128` versions, which also work for `ldexp`. This involves a new algorithm for `f16` because the default does not converge fast enough with a limited number of rounds.
2025-01-03 04:34:21 +00:00 · 2025-01-03 04:34:21 +00:00 · cc2874c9a9
commit cc2874c9a9
parent 98bee053ef
15 changed files with 195 additions and 39 deletions
--- a/library/compiler-builtins/libm/crates/libm-macros/src/shared.rs
+++ b/library/compiler-builtins/libm/crates/libm-macros/src/shared.rs
@ -134,6 +134,13 @@ const ALL_OPERATIONS_NESTED: &[(FloatTy, Signature, Option<Signature>, &[&str])]
        None,
        &["jn", "yn"],
    ),
+    (
+        // `(f16, i32) -> f16`
+        FloatTy::F16,
+        Signature { args: &[Ty::F16, Ty::I32], returns: &[Ty::F16] },
+        None,
+        &["scalbnf16", "ldexpf16"],
+    ),
    (
        // `(f32, i32) -> f32`
        FloatTy::F32,
@ -148,6 +155,13 @@ const ALL_OPERATIONS_NESTED: &[(FloatTy, Signature, Option<Signature>, &[&str])]
        None,
        &["scalbn", "ldexp"],
    ),
+    (
+        // `(f128, i32) -> f128`
+        FloatTy::F128,
+        Signature { args: &[Ty::F128, Ty::I32], returns: &[Ty::F128] },
+        None,
+        &["scalbnf128", "ldexpf128"],
+    ),
    (
        // `(f32, &mut f32) -> f32` as `(f32) -> (f32, f32)`
        FloatTy::F32,
--- a/library/compiler-builtins/libm/crates/libm-test/benches/icount.rs
+++ b/library/compiler-builtins/libm/crates/libm-test/benches/icount.rs
@ -131,6 +131,8 @@ main!(
    icount_bench_jn_group,
    icount_bench_jnf_group,
    icount_bench_ldexp_group,
+    icount_bench_ldexpf128_group,
+    icount_bench_ldexpf16_group,
    icount_bench_ldexpf_group,
    icount_bench_lgamma_group,
    icount_bench_lgamma_r_group,
@ -163,6 +165,8 @@ main!(
    icount_bench_roundf16_group,
    icount_bench_roundf_group,
    icount_bench_scalbn_group,
+    icount_bench_scalbnf128_group,
+    icount_bench_scalbnf16_group,
    icount_bench_scalbnf_group,
    icount_bench_sin_group,
    icount_bench_sinf_group,
--- a/library/compiler-builtins/libm/crates/libm-test/benches/random.rs
+++ b/library/compiler-builtins/libm/crates/libm-test/benches/random.rs
@ -133,10 +133,14 @@ libm_macros::for_each_function! {
        | fminf16
        | fmodf128
        | fmodf16
+        | ldexpf128
+        | ldexpf16
        | rintf128
        | rintf16
        | roundf128
        | roundf16
+        | scalbnf128
+        | scalbnf16
        | sqrtf128
        | sqrtf16
        | truncf128
--- a/library/compiler-builtins/libm/crates/libm-test/src/mpfloat.rs
+++ b/library/compiler-builtins/libm/crates/libm-test/src/mpfloat.rs
@ -159,6 +159,8 @@ libm_macros::for_each_function! {
        jnf,
        ldexp,
        ldexpf,
+        ldexpf128,
+        ldexpf16,
        lgamma_r,
        lgammaf_r,
        modf,
@ -178,6 +180,8 @@ libm_macros::for_each_function! {
        roundf16,
        scalbn,
        scalbnf,
+        scalbnf128,
+        scalbnf16,
        sincos,sincosf,
        trunc,
        truncf,
@ -351,34 +355,6 @@ macro_rules! impl_op_for_ty {
                }
            }

-            // `ldexp` and `scalbn` are the same for binary floating point, so just forward all
-            // methods.
-            impl MpOp for crate::op::[<ldexp $suffix>]::Routine {
-                type MpTy = <crate::op::[<scalbn $suffix>]::Routine as MpOp>::MpTy;
-
-                fn new_mp() -> Self::MpTy {
-                    <crate::op::[<scalbn $suffix>]::Routine as MpOp>::new_mp()
-                }
-
-                fn run(this: &mut Self::MpTy, input: Self::RustArgs) -> Self::RustRet {
-                    <crate::op::[<scalbn $suffix>]::Routine as MpOp>::run(this, input)
-                }
-            }
-
-            impl MpOp for crate::op::[<scalbn $suffix>]::Routine {
-                type MpTy = MpFloat;
-
-                fn new_mp() -> Self::MpTy {
-                    new_mpfloat::<Self::FTy>()
-                }
-
-                fn run(this: &mut Self::MpTy, input: Self::RustArgs) -> Self::RustRet {
-                    this.assign(input.0);
-                    *this <<= input.1;
-                    prep_retval::<Self::FTy>(this, Ordering::Equal)
-                }
-            }
-
            impl MpOp for crate::op::[<sincos $suffix>]::Routine {
                type MpTy = (MpFloat, MpFloat);

@ -464,6 +440,35 @@ macro_rules! impl_op_for_ty_all {
                    this.1.assign(input.1);
                    let ord = this.0.rem_assign_round(&this.1, Nearest);
                    prep_retval::<Self::RustRet>(&mut this.0, ord)
+
+                }
+            }
+
+            // `ldexp` and `scalbn` are the same for binary floating point, so just forward all
+            // methods.
+            impl MpOp for crate::op::[<ldexp $suffix>]::Routine {
+                type MpTy = <crate::op::[<scalbn $suffix>]::Routine as MpOp>::MpTy;
+
+                fn new_mp() -> Self::MpTy {
+                    <crate::op::[<scalbn $suffix>]::Routine as MpOp>::new_mp()
+                }
+
+                fn run(this: &mut Self::MpTy, input: Self::RustArgs) -> Self::RustRet {
+                    <crate::op::[<scalbn $suffix>]::Routine as MpOp>::run(this, input)
+                }
+            }
+
+            impl MpOp for crate::op::[<scalbn $suffix>]::Routine {
+                type MpTy = MpFloat;
+
+                fn new_mp() -> Self::MpTy {
+                    new_mpfloat::<Self::FTy>()
+                }
+
+                fn run(this: &mut Self::MpTy, input: Self::RustArgs) -> Self::RustRet {
+                    this.assign(input.0);
+                    *this <<= input.1;
+                    prep_retval::<Self::FTy>(this, Ordering::Equal)
                }
            }
        }
--- a/library/compiler-builtins/libm/crates/libm-test/src/precision.rs
+++ b/library/compiler-builtins/libm/crates/libm-test/src/precision.rs
@ -551,8 +551,12 @@ fn int_float_common<F1: Float, F2: Float>(
    DEFAULT
 }

+#[cfg(f16_enabled)]
+impl MaybeOverride<(f16, i32)> for SpecialCase {}
 impl MaybeOverride<(f32, i32)> for SpecialCase {}
 impl MaybeOverride<(f64, i32)> for SpecialCase {}
+#[cfg(f128_enabled)]
+impl MaybeOverride<(f128, i32)> for SpecialCase {}

 impl MaybeOverride<(f32, f32, f32)> for SpecialCase {
    fn check_float<F: Float>(
--- a/library/compiler-builtins/libm/crates/libm-test/tests/compare_built_musl.rs
+++ b/library/compiler-builtins/libm/crates/libm-test/tests/compare_built_musl.rs
@ -95,10 +95,14 @@ libm_macros::for_each_function! {
        fminf16,
        fmodf128,
        fmodf16,
+        ldexpf128,
+        ldexpf16,
        rintf128,
        rintf16,
        roundf128,
        roundf16,
+        scalbnf128,
+        scalbnf16,
        sqrtf128,
        sqrtf16,
        truncf128,
--- a/library/compiler-builtins/libm/crates/util/src/main.rs
+++ b/library/compiler-builtins/libm/crates/util/src/main.rs
@ -102,10 +102,14 @@ fn do_eval(basis: &str, op: &str, inputs: &[&str]) {
            | fminf16
            | fmodf128
            | fmodf16
+            | ldexpf128
+            | ldexpf16
            | rintf128
            | rintf16
            | roundf128
            | roundf16
+            | scalbnf128
+            | scalbnf16
            | sqrtf128
            | sqrtf16
            | truncf128
--- a/library/compiler-builtins/libm/etc/function-definitions.json
+++ b/library/compiler-builtins/libm/etc/function-definitions.json
@ -554,6 +554,18 @@
        ],
        "type": "f32"
    },
+    "ldexpf128": {
+        "sources": [
+            "src/math/ldexpf128.rs"
+        ],
+        "type": "f128"
+    },
+    "ldexpf16": {
+        "sources": [
+            "src/math/ldexpf16.rs"
+        ],
+        "type": "f16"
+    },
    "lgamma": {
        "sources": [
            "src/libm_helper.rs",
@ -774,6 +786,20 @@
        ],
        "type": "f32"
    },
+    "scalbnf128": {
+        "sources": [
+            "src/math/generic/scalbn.rs",
+            "src/math/scalbnf128.rs"
+        ],
+        "type": "f128"
+    },
+    "scalbnf16": {
+        "sources": [
+            "src/math/generic/scalbn.rs",
+            "src/math/scalbnf16.rs"
+        ],
+        "type": "f16"
+    },
    "sin": {
        "sources": [
            "src/libm_helper.rs",
--- a/library/compiler-builtins/libm/etc/function-list.txt
+++ b/library/compiler-builtins/libm/etc/function-list.txt
@ -79,6 +79,8 @@ jn
 jnf
 ldexp
 ldexpf
+ldexpf128
+ldexpf16
 lgamma
 lgamma_r
 lgammaf
@ -111,6 +113,8 @@ roundf128
 roundf16
 scalbn
 scalbnf
+scalbnf128
+scalbnf16
 sin
 sincos
 sincosf
--- a/library/compiler-builtins/libm/src/math/generic/scalbn.rs
+++ b/library/compiler-builtins/libm/src/math/generic/scalbn.rs
@ -31,16 +31,27 @@ where
    let exp_max: i32 = F::EXP_BIAS as i32;
    let exp_min = -(exp_max - 1);

-    // 2 ^ Emax, where Emax is the maximum biased exponent value (1023 for f64)
+    // 2 ^ Emax, maximum positive with null significand (0x1p1023 for f64)
    let f_exp_max = F::from_parts(false, F::EXP_BIAS << 1, zero);

-    // 2 ^ Emin, where Emin is the minimum biased exponent value (-1022 for f64)
+    // 2 ^ Emin, minimum positive normal with null significand (0x1p-1022 for f64)
    let f_exp_min = F::from_parts(false, 1, zero);

-    // 2 ^ sig_total_bits, representation of what can be accounted for with subnormals
-    let f_exp_subnorm = F::from_parts(false, sig_total_bits + F::EXP_BIAS, zero);
+    // 2 ^ sig_total_bits, moltiplier to normalize subnormals (0x1p53 for f64)
+    let f_pow_subnorm = F::from_parts(false, sig_total_bits + F::EXP_BIAS, zero);
+
+    /*
+     * The goal is to multiply `x` by a scale factor that applies `n`. However, there are cases
+     * where `2^n` is not representable by `F` but the result should be, e.g. `x = 2^Emin` with
+     * `n = -EMin + 2` (one out of range of 2^Emax). To get around this, reduce the magnitude of
+     * the final scale operation by prescaling by the max/min power representable by `F`.
+     */

    if n > exp_max {
+        // Worse case positive `n`: `x`  is the minimum subnormal value, the result is `F::MAX`.
+        // This can be reached by three scaling multiplications (two here and one final).
+        debug_assert!(-exp_min + F::SIG_BITS as i32 + exp_max <= exp_max * 3);
+
        x *= f_exp_max;
        n -= exp_max;
        if n > exp_max {
@ -51,21 +62,61 @@ where
            }
        }
    } else if n < exp_min {
-        let mul = f_exp_min * f_exp_subnorm;
-        let add = (exp_max - 1) - sig_total_bits as i32;
+        // When scaling toward 0, the prescaling is limited to a value that does not allow `x` to
+        // go subnormal. This avoids double rounding.
+        if F::BITS > 16 {
+            // `mul` s.t. `!(x * mul).is_subnormal() ∀ x`
+            let mul = f_exp_min * f_pow_subnorm;
+            let add = -exp_min - sig_total_bits as i32;
+
+            // Worse case negative `n`: `x`  is the maximum positive value, the result is `F::MIN`.
+            // This must be reachable by three scaling multiplications (two here and one final).
+            debug_assert!(-exp_min + F::SIG_BITS as i32 + exp_max <= add * 2 + -exp_min);

-        x *= mul;
-        n += add;
-        if n < exp_min {
            x *= mul;
            n += add;
+
            if n < exp_min {
-                n = exp_min;
+                x *= mul;
+                n += add;
+
+                if n < exp_min {
+                    n = exp_min;
+                }
+            }
+        } else {
+            // `f16` is unique compared to other float types in that the difference between the
+            // minimum exponent and the significand bits (`add = -exp_min - sig_total_bits`) is
+            // small, only three. The above method depend on decrementing `n` by `add` two times;
+            // for other float types this works out because `add` is a substantial fraction of
+            // the exponent range. For `f16`, however, 3 is relatively small compared to the
+            // exponent range (which is 39), so that requires ~10 prescale rounds rather than two.
+            //
+            // Work aroudn this by using a different algorithm that calculates the prescale
+            // dynamically based on the maximum possible value. This adds more operations per round
+            // since it needs to construct the scale, but works better in the general case.
+            let add = -(n + sig_total_bits as i32).clamp(exp_min, sig_total_bits as i32);
+            let mul = F::from_parts(false, (F::EXP_BIAS as i32 - add) as u32, zero);
+
+            x *= mul;
+            n += add;
+
+            if n < exp_min {
+                let add = -(n + sig_total_bits as i32).clamp(exp_min, sig_total_bits as i32);
+                let mul = F::from_parts(false, (F::EXP_BIAS as i32 - add) as u32, zero);
+
+                x *= mul;
+                n += add;
+
+                if n < exp_min {
+                    n = exp_min;
+                }
            }
        }
    }

-    x * F::from_parts(false, (F::EXP_BIAS as i32 + n) as u32, zero)
+    let scale = F::from_parts(false, (F::EXP_BIAS as i32 + n) as u32, zero);
+    x * scale
 }

 #[cfg(test)]
@ -111,6 +162,12 @@ mod tests {
        assert!(scalbn(-F::NAN, -10).is_nan());
    }

+    #[test]
+    #[cfg(f16_enabled)]
+    fn spec_test_f16() {
+        spec_test::<f16>();
+    }
+
    #[test]
    fn spec_test_f32() {
        spec_test::<f32>();
@ -120,4 +177,10 @@ mod tests {
    fn spec_test_f64() {
        spec_test::<f64>();
    }
+
+    #[test]
+    #[cfg(f128_enabled)]
+    fn spec_test_f128() {
+        spec_test::<f128>();
+    }
 }
--- a/library/compiler-builtins/libm/src/math/ldexpf128.rs
+++ b/library/compiler-builtins/libm/src/math/ldexpf128.rs
@ -0,0 +1,4 @@
+#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+pub fn ldexpf128(x: f128, n: i32) -> f128 {
+    super::scalbnf128(x, n)
+}
--- a/library/compiler-builtins/libm/src/math/ldexpf16.rs
+++ b/library/compiler-builtins/libm/src/math/ldexpf16.rs
@ -0,0 +1,4 @@
+#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+pub fn ldexpf16(x: f16, n: i32) -> f16 {
+    super::scalbnf16(x, n)
+}
--- a/library/compiler-builtins/libm/src/math/mod.rs
+++ b/library/compiler-builtins/libm/src/math/mod.rs
@ -349,8 +349,10 @@ cfg_if! {
        mod fmaxf16;
        mod fminf16;
        mod fmodf16;
+        mod ldexpf16;
        mod rintf16;
        mod roundf16;
+        mod scalbnf16;
        mod sqrtf16;
        mod truncf16;

@ -362,8 +364,10 @@ cfg_if! {
        pub use self::fmaxf16::fmaxf16;
        pub use self::fminf16::fminf16;
        pub use self::fmodf16::fmodf16;
+        pub use self::ldexpf16::ldexpf16;
        pub use self::rintf16::rintf16;
        pub use self::roundf16::roundf16;
+        pub use self::scalbnf16::scalbnf16;
        pub use self::sqrtf16::sqrtf16;
        pub use self::truncf16::truncf16;
    }
@ -379,8 +383,10 @@ cfg_if! {
        mod fmaxf128;
        mod fminf128;
        mod fmodf128;
+        mod ldexpf128;
        mod rintf128;
        mod roundf128;
+        mod scalbnf128;
        mod sqrtf128;
        mod truncf128;

@ -392,8 +398,10 @@ cfg_if! {
        pub use self::fmaxf128::fmaxf128;
        pub use self::fminf128::fminf128;
        pub use self::fmodf128::fmodf128;
+        pub use self::ldexpf128::ldexpf128;
        pub use self::rintf128::rintf128;
        pub use self::roundf128::roundf128;
+        pub use self::scalbnf128::scalbnf128;
        pub use self::sqrtf128::sqrtf128;
        pub use self::truncf128::truncf128;
    }
--- a/library/compiler-builtins/libm/src/math/scalbnf128.rs
+++ b/library/compiler-builtins/libm/src/math/scalbnf128.rs
@ -0,0 +1,4 @@
+#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+pub fn scalbnf128(x: f128, n: i32) -> f128 {
+    super::generic::scalbn(x, n)
+}
--- a/library/compiler-builtins/libm/src/math/scalbnf16.rs
+++ b/library/compiler-builtins/libm/src/math/scalbnf16.rs
@ -0,0 +1,4 @@
+#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+pub fn scalbnf16(x: f16, n: i32) -> f16 {
+    super::generic::scalbn(x, n)
+}