libm: Optimize fmod

This is kind of a retry at rust-lang/compiler-builtins#898. One of the
problems there was that it would have added overhead and regressed
performance for typical inputs.

Unlike that PR, this doesn't aim for sub-linear scaling; the cost of
evaluating `fmod(x, y)` is still roughly proportional to `log2(|x/y|)`.
However, the constant factor is much better. Running the
`random`-benchmarks locally, I got walltime reductions of

    fmodf16:  -56.9%
    fmodf:    -85.0%
    fmod:     -95.4%
    fmodf128: -98.7%
This commit is contained in:
quaternic 2025-08-01 02:20:04 +03:00 committed by Trevor Gross
parent ec65d89176
commit e2ee56b206
3 changed files with 57 additions and 12 deletions

View file

@ -1,8 +1,12 @@
/* SPDX-License-Identifier: MIT OR Apache-2.0 */
use crate::support::{CastFrom, Float, Int, MinInt};
use crate::support::{CastFrom, CastInto, Float, HInt, Int, MinInt, NarrowingDiv};
#[inline]
pub fn fmod<F: Float>(x: F, y: F) -> F {
pub fn fmod<F: Float>(x: F, y: F) -> F
where
F::Int: HInt,
<F::Int as HInt>::D: NarrowingDiv,
{
let _1 = F::Int::ONE;
let sx = x.to_bits() & F::SIGN_MASK;
let ux = x.to_bits() & !F::SIGN_MASK;
@ -29,7 +33,7 @@ pub fn fmod<F: Float>(x: F, y: F) -> F {
// To compute `(num << ex) % (div << ey)`, first
// evaluate `rem = (num << (ex - ey)) % div` ...
let rem = reduction(num, ex - ey, div);
let rem = reduction::<F>(num, ex - ey, div);
// ... so the result will be `rem << ey`
if rem.is_zero() {
@ -58,11 +62,55 @@ fn into_sig_exp<F: Float>(mut bits: F::Int) -> (F::Int, u32) {
}
/// Compute the remainder `(x * 2.pow(e)) % y` without overflow.
fn reduction<I: Int>(mut x: I, e: u32, y: I) -> I {
x %= y;
for _ in 0..e {
x <<= 1;
x = x.checked_sub(y).unwrap_or(x);
fn reduction<F>(mut x: F::Int, e: u32, y: F::Int) -> F::Int
where
F: Float,
F::Int: HInt,
<<F as Float>::Int as HInt>::D: NarrowingDiv,
{
// `f16` only has 5 exponent bits, so even `f16::MAX = 65504.0` is only
// a 40-bit integer multiple of the smallest subnormal.
if F::BITS == 16 {
debug_assert!(F::EXP_MAX - F::EXP_MIN == 29);
debug_assert!(e <= 29);
let u: u16 = x.cast();
let v: u16 = y.cast();
let u = (u as u64) << e;
let v = v as u64;
return F::Int::cast_from((u % v) as u16);
}
x
// Ensure `x < 2y` for later steps
if x >= (y << 1) {
// This case is only reached with subnormal divisors,
// but it might be better to just normalize all significands
// to make this unnecessary. The further calls could potentially
// benefit from assuming a specific fixed leading bit position.
x %= y;
}
// The simple implementation seems to be fastest for a short reduction
// at this size. The limit here was chosen empirically on an Intel Nehalem.
// Less old CPUs that have faster `u64 * u64 -> u128` might not benefit,
// and 32-bit systems or architectures without hardware multipliers might
// want to do this in more cases.
if F::BITS == 64 && e < 32 {
// Assumes `x < 2y`
for _ in 0..e {
x = x.checked_sub(y).unwrap_or(x);
x <<= 1;
}
return x.checked_sub(y).unwrap_or(x);
}
// Fast path for short reductions
if e < F::BITS {
let w = x.widen() << e;
if let Some((_, r)) = w.checked_narrowing_div_rem(y) {
return r;
}
}
// Assumes `x < 2y`
crate::support::linear_mul_reduction(x, e, y)
}

View file

@ -29,9 +29,7 @@ pub use hex_float::hf16;
pub use hex_float::hf128;
#[allow(unused_imports)]
pub use hex_float::{hf32, hf64};
#[allow(unused_imports)]
pub use int_traits::{CastFrom, CastInto, DInt, HInt, Int, MinInt, NarrowingDiv};
#[allow(unused_imports)]
pub use modular::linear_mul_reduction;
/// Hint to the compiler that the current path is cold.

View file

@ -14,7 +14,6 @@ use crate::support::{DInt, HInt, Int};
/// Compute the remainder `(x << e) % y` with unbounded integers.
/// Requires `x < 2y` and `y.leading_zeros() >= 2`
#[allow(dead_code)]
pub fn linear_mul_reduction<U>(x: U, mut e: u32, mut y: U) -> U
where
U: HInt + Int<Unsigned = U>,