Add scalbnf16, scalbnf128, ldexpf16, and ldexpf128
Use the generic `scalbn` to provide `f16` and `f128` versions, which also work for `ldexp`. This involves a new algorithm for `f16` because the default does not converge fast enough with a limited number of rounds.
This commit is contained in:
parent
98bee053ef
commit
cc2874c9a9
15 changed files with 195 additions and 39 deletions
|
|
@ -134,6 +134,13 @@ const ALL_OPERATIONS_NESTED: &[(FloatTy, Signature, Option<Signature>, &[&str])]
|
|||
None,
|
||||
&["jn", "yn"],
|
||||
),
|
||||
(
|
||||
// `(f16, i32) -> f16`
|
||||
FloatTy::F16,
|
||||
Signature { args: &[Ty::F16, Ty::I32], returns: &[Ty::F16] },
|
||||
None,
|
||||
&["scalbnf16", "ldexpf16"],
|
||||
),
|
||||
(
|
||||
// `(f32, i32) -> f32`
|
||||
FloatTy::F32,
|
||||
|
|
@ -148,6 +155,13 @@ const ALL_OPERATIONS_NESTED: &[(FloatTy, Signature, Option<Signature>, &[&str])]
|
|||
None,
|
||||
&["scalbn", "ldexp"],
|
||||
),
|
||||
(
|
||||
// `(f128, i32) -> f128`
|
||||
FloatTy::F128,
|
||||
Signature { args: &[Ty::F128, Ty::I32], returns: &[Ty::F128] },
|
||||
None,
|
||||
&["scalbnf128", "ldexpf128"],
|
||||
),
|
||||
(
|
||||
// `(f32, &mut f32) -> f32` as `(f32) -> (f32, f32)`
|
||||
FloatTy::F32,
|
||||
|
|
|
|||
|
|
@ -131,6 +131,8 @@ main!(
|
|||
icount_bench_jn_group,
|
||||
icount_bench_jnf_group,
|
||||
icount_bench_ldexp_group,
|
||||
icount_bench_ldexpf128_group,
|
||||
icount_bench_ldexpf16_group,
|
||||
icount_bench_ldexpf_group,
|
||||
icount_bench_lgamma_group,
|
||||
icount_bench_lgamma_r_group,
|
||||
|
|
@ -163,6 +165,8 @@ main!(
|
|||
icount_bench_roundf16_group,
|
||||
icount_bench_roundf_group,
|
||||
icount_bench_scalbn_group,
|
||||
icount_bench_scalbnf128_group,
|
||||
icount_bench_scalbnf16_group,
|
||||
icount_bench_scalbnf_group,
|
||||
icount_bench_sin_group,
|
||||
icount_bench_sinf_group,
|
||||
|
|
|
|||
|
|
@ -133,10 +133,14 @@ libm_macros::for_each_function! {
|
|||
| fminf16
|
||||
| fmodf128
|
||||
| fmodf16
|
||||
| ldexpf128
|
||||
| ldexpf16
|
||||
| rintf128
|
||||
| rintf16
|
||||
| roundf128
|
||||
| roundf16
|
||||
| scalbnf128
|
||||
| scalbnf16
|
||||
| sqrtf128
|
||||
| sqrtf16
|
||||
| truncf128
|
||||
|
|
|
|||
|
|
@ -159,6 +159,8 @@ libm_macros::for_each_function! {
|
|||
jnf,
|
||||
ldexp,
|
||||
ldexpf,
|
||||
ldexpf128,
|
||||
ldexpf16,
|
||||
lgamma_r,
|
||||
lgammaf_r,
|
||||
modf,
|
||||
|
|
@ -178,6 +180,8 @@ libm_macros::for_each_function! {
|
|||
roundf16,
|
||||
scalbn,
|
||||
scalbnf,
|
||||
scalbnf128,
|
||||
scalbnf16,
|
||||
sincos,sincosf,
|
||||
trunc,
|
||||
truncf,
|
||||
|
|
@ -351,34 +355,6 @@ macro_rules! impl_op_for_ty {
|
|||
}
|
||||
}
|
||||
|
||||
// `ldexp` and `scalbn` are the same for binary floating point, so just forward all
|
||||
// methods.
|
||||
impl MpOp for crate::op::[<ldexp $suffix>]::Routine {
|
||||
type MpTy = <crate::op::[<scalbn $suffix>]::Routine as MpOp>::MpTy;
|
||||
|
||||
fn new_mp() -> Self::MpTy {
|
||||
<crate::op::[<scalbn $suffix>]::Routine as MpOp>::new_mp()
|
||||
}
|
||||
|
||||
fn run(this: &mut Self::MpTy, input: Self::RustArgs) -> Self::RustRet {
|
||||
<crate::op::[<scalbn $suffix>]::Routine as MpOp>::run(this, input)
|
||||
}
|
||||
}
|
||||
|
||||
impl MpOp for crate::op::[<scalbn $suffix>]::Routine {
|
||||
type MpTy = MpFloat;
|
||||
|
||||
fn new_mp() -> Self::MpTy {
|
||||
new_mpfloat::<Self::FTy>()
|
||||
}
|
||||
|
||||
fn run(this: &mut Self::MpTy, input: Self::RustArgs) -> Self::RustRet {
|
||||
this.assign(input.0);
|
||||
*this <<= input.1;
|
||||
prep_retval::<Self::FTy>(this, Ordering::Equal)
|
||||
}
|
||||
}
|
||||
|
||||
impl MpOp for crate::op::[<sincos $suffix>]::Routine {
|
||||
type MpTy = (MpFloat, MpFloat);
|
||||
|
||||
|
|
@ -464,6 +440,35 @@ macro_rules! impl_op_for_ty_all {
|
|||
this.1.assign(input.1);
|
||||
let ord = this.0.rem_assign_round(&this.1, Nearest);
|
||||
prep_retval::<Self::RustRet>(&mut this.0, ord)
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
// `ldexp` and `scalbn` are the same for binary floating point, so just forward all
|
||||
// methods.
|
||||
impl MpOp for crate::op::[<ldexp $suffix>]::Routine {
|
||||
type MpTy = <crate::op::[<scalbn $suffix>]::Routine as MpOp>::MpTy;
|
||||
|
||||
fn new_mp() -> Self::MpTy {
|
||||
<crate::op::[<scalbn $suffix>]::Routine as MpOp>::new_mp()
|
||||
}
|
||||
|
||||
fn run(this: &mut Self::MpTy, input: Self::RustArgs) -> Self::RustRet {
|
||||
<crate::op::[<scalbn $suffix>]::Routine as MpOp>::run(this, input)
|
||||
}
|
||||
}
|
||||
|
||||
impl MpOp for crate::op::[<scalbn $suffix>]::Routine {
|
||||
type MpTy = MpFloat;
|
||||
|
||||
fn new_mp() -> Self::MpTy {
|
||||
new_mpfloat::<Self::FTy>()
|
||||
}
|
||||
|
||||
fn run(this: &mut Self::MpTy, input: Self::RustArgs) -> Self::RustRet {
|
||||
this.assign(input.0);
|
||||
*this <<= input.1;
|
||||
prep_retval::<Self::FTy>(this, Ordering::Equal)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -551,8 +551,12 @@ fn int_float_common<F1: Float, F2: Float>(
|
|||
DEFAULT
|
||||
}
|
||||
|
||||
#[cfg(f16_enabled)]
|
||||
impl MaybeOverride<(f16, i32)> for SpecialCase {}
|
||||
impl MaybeOverride<(f32, i32)> for SpecialCase {}
|
||||
impl MaybeOverride<(f64, i32)> for SpecialCase {}
|
||||
#[cfg(f128_enabled)]
|
||||
impl MaybeOverride<(f128, i32)> for SpecialCase {}
|
||||
|
||||
impl MaybeOverride<(f32, f32, f32)> for SpecialCase {
|
||||
fn check_float<F: Float>(
|
||||
|
|
|
|||
|
|
@ -95,10 +95,14 @@ libm_macros::for_each_function! {
|
|||
fminf16,
|
||||
fmodf128,
|
||||
fmodf16,
|
||||
ldexpf128,
|
||||
ldexpf16,
|
||||
rintf128,
|
||||
rintf16,
|
||||
roundf128,
|
||||
roundf16,
|
||||
scalbnf128,
|
||||
scalbnf16,
|
||||
sqrtf128,
|
||||
sqrtf16,
|
||||
truncf128,
|
||||
|
|
|
|||
|
|
@ -102,10 +102,14 @@ fn do_eval(basis: &str, op: &str, inputs: &[&str]) {
|
|||
| fminf16
|
||||
| fmodf128
|
||||
| fmodf16
|
||||
| ldexpf128
|
||||
| ldexpf16
|
||||
| rintf128
|
||||
| rintf16
|
||||
| roundf128
|
||||
| roundf16
|
||||
| scalbnf128
|
||||
| scalbnf16
|
||||
| sqrtf128
|
||||
| sqrtf16
|
||||
| truncf128
|
||||
|
|
|
|||
|
|
@ -554,6 +554,18 @@
|
|||
],
|
||||
"type": "f32"
|
||||
},
|
||||
"ldexpf128": {
|
||||
"sources": [
|
||||
"src/math/ldexpf128.rs"
|
||||
],
|
||||
"type": "f128"
|
||||
},
|
||||
"ldexpf16": {
|
||||
"sources": [
|
||||
"src/math/ldexpf16.rs"
|
||||
],
|
||||
"type": "f16"
|
||||
},
|
||||
"lgamma": {
|
||||
"sources": [
|
||||
"src/libm_helper.rs",
|
||||
|
|
@ -774,6 +786,20 @@
|
|||
],
|
||||
"type": "f32"
|
||||
},
|
||||
"scalbnf128": {
|
||||
"sources": [
|
||||
"src/math/generic/scalbn.rs",
|
||||
"src/math/scalbnf128.rs"
|
||||
],
|
||||
"type": "f128"
|
||||
},
|
||||
"scalbnf16": {
|
||||
"sources": [
|
||||
"src/math/generic/scalbn.rs",
|
||||
"src/math/scalbnf16.rs"
|
||||
],
|
||||
"type": "f16"
|
||||
},
|
||||
"sin": {
|
||||
"sources": [
|
||||
"src/libm_helper.rs",
|
||||
|
|
|
|||
|
|
@ -79,6 +79,8 @@ jn
|
|||
jnf
|
||||
ldexp
|
||||
ldexpf
|
||||
ldexpf128
|
||||
ldexpf16
|
||||
lgamma
|
||||
lgamma_r
|
||||
lgammaf
|
||||
|
|
@ -111,6 +113,8 @@ roundf128
|
|||
roundf16
|
||||
scalbn
|
||||
scalbnf
|
||||
scalbnf128
|
||||
scalbnf16
|
||||
sin
|
||||
sincos
|
||||
sincosf
|
||||
|
|
|
|||
|
|
@ -31,16 +31,27 @@ where
|
|||
let exp_max: i32 = F::EXP_BIAS as i32;
|
||||
let exp_min = -(exp_max - 1);
|
||||
|
||||
// 2 ^ Emax, where Emax is the maximum biased exponent value (1023 for f64)
|
||||
// 2 ^ Emax, maximum positive with null significand (0x1p1023 for f64)
|
||||
let f_exp_max = F::from_parts(false, F::EXP_BIAS << 1, zero);
|
||||
|
||||
// 2 ^ Emin, where Emin is the minimum biased exponent value (-1022 for f64)
|
||||
// 2 ^ Emin, minimum positive normal with null significand (0x1p-1022 for f64)
|
||||
let f_exp_min = F::from_parts(false, 1, zero);
|
||||
|
||||
// 2 ^ sig_total_bits, representation of what can be accounted for with subnormals
|
||||
let f_exp_subnorm = F::from_parts(false, sig_total_bits + F::EXP_BIAS, zero);
|
||||
// 2 ^ sig_total_bits, moltiplier to normalize subnormals (0x1p53 for f64)
|
||||
let f_pow_subnorm = F::from_parts(false, sig_total_bits + F::EXP_BIAS, zero);
|
||||
|
||||
/*
|
||||
* The goal is to multiply `x` by a scale factor that applies `n`. However, there are cases
|
||||
* where `2^n` is not representable by `F` but the result should be, e.g. `x = 2^Emin` with
|
||||
* `n = -EMin + 2` (one out of range of 2^Emax). To get around this, reduce the magnitude of
|
||||
* the final scale operation by prescaling by the max/min power representable by `F`.
|
||||
*/
|
||||
|
||||
if n > exp_max {
|
||||
// Worse case positive `n`: `x` is the minimum subnormal value, the result is `F::MAX`.
|
||||
// This can be reached by three scaling multiplications (two here and one final).
|
||||
debug_assert!(-exp_min + F::SIG_BITS as i32 + exp_max <= exp_max * 3);
|
||||
|
||||
x *= f_exp_max;
|
||||
n -= exp_max;
|
||||
if n > exp_max {
|
||||
|
|
@ -51,21 +62,61 @@ where
|
|||
}
|
||||
}
|
||||
} else if n < exp_min {
|
||||
let mul = f_exp_min * f_exp_subnorm;
|
||||
let add = (exp_max - 1) - sig_total_bits as i32;
|
||||
// When scaling toward 0, the prescaling is limited to a value that does not allow `x` to
|
||||
// go subnormal. This avoids double rounding.
|
||||
if F::BITS > 16 {
|
||||
// `mul` s.t. `!(x * mul).is_subnormal() ∀ x`
|
||||
let mul = f_exp_min * f_pow_subnorm;
|
||||
let add = -exp_min - sig_total_bits as i32;
|
||||
|
||||
// Worse case negative `n`: `x` is the maximum positive value, the result is `F::MIN`.
|
||||
// This must be reachable by three scaling multiplications (two here and one final).
|
||||
debug_assert!(-exp_min + F::SIG_BITS as i32 + exp_max <= add * 2 + -exp_min);
|
||||
|
||||
x *= mul;
|
||||
n += add;
|
||||
if n < exp_min {
|
||||
x *= mul;
|
||||
n += add;
|
||||
|
||||
if n < exp_min {
|
||||
n = exp_min;
|
||||
x *= mul;
|
||||
n += add;
|
||||
|
||||
if n < exp_min {
|
||||
n = exp_min;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
// `f16` is unique compared to other float types in that the difference between the
|
||||
// minimum exponent and the significand bits (`add = -exp_min - sig_total_bits`) is
|
||||
// small, only three. The above method depend on decrementing `n` by `add` two times;
|
||||
// for other float types this works out because `add` is a substantial fraction of
|
||||
// the exponent range. For `f16`, however, 3 is relatively small compared to the
|
||||
// exponent range (which is 39), so that requires ~10 prescale rounds rather than two.
|
||||
//
|
||||
// Work aroudn this by using a different algorithm that calculates the prescale
|
||||
// dynamically based on the maximum possible value. This adds more operations per round
|
||||
// since it needs to construct the scale, but works better in the general case.
|
||||
let add = -(n + sig_total_bits as i32).clamp(exp_min, sig_total_bits as i32);
|
||||
let mul = F::from_parts(false, (F::EXP_BIAS as i32 - add) as u32, zero);
|
||||
|
||||
x *= mul;
|
||||
n += add;
|
||||
|
||||
if n < exp_min {
|
||||
let add = -(n + sig_total_bits as i32).clamp(exp_min, sig_total_bits as i32);
|
||||
let mul = F::from_parts(false, (F::EXP_BIAS as i32 - add) as u32, zero);
|
||||
|
||||
x *= mul;
|
||||
n += add;
|
||||
|
||||
if n < exp_min {
|
||||
n = exp_min;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
x * F::from_parts(false, (F::EXP_BIAS as i32 + n) as u32, zero)
|
||||
let scale = F::from_parts(false, (F::EXP_BIAS as i32 + n) as u32, zero);
|
||||
x * scale
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
|
|
@ -111,6 +162,12 @@ mod tests {
|
|||
assert!(scalbn(-F::NAN, -10).is_nan());
|
||||
}
|
||||
|
||||
#[test]
|
||||
#[cfg(f16_enabled)]
|
||||
fn spec_test_f16() {
|
||||
spec_test::<f16>();
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn spec_test_f32() {
|
||||
spec_test::<f32>();
|
||||
|
|
@ -120,4 +177,10 @@ mod tests {
|
|||
fn spec_test_f64() {
|
||||
spec_test::<f64>();
|
||||
}
|
||||
|
||||
#[test]
|
||||
#[cfg(f128_enabled)]
|
||||
fn spec_test_f128() {
|
||||
spec_test::<f128>();
|
||||
}
|
||||
}
|
||||
|
|
|
|||
4
library/compiler-builtins/libm/src/math/ldexpf128.rs
Normal file
4
library/compiler-builtins/libm/src/math/ldexpf128.rs
Normal file
|
|
@ -0,0 +1,4 @@
|
|||
#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
|
||||
pub fn ldexpf128(x: f128, n: i32) -> f128 {
|
||||
super::scalbnf128(x, n)
|
||||
}
|
||||
4
library/compiler-builtins/libm/src/math/ldexpf16.rs
Normal file
4
library/compiler-builtins/libm/src/math/ldexpf16.rs
Normal file
|
|
@ -0,0 +1,4 @@
|
|||
#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
|
||||
pub fn ldexpf16(x: f16, n: i32) -> f16 {
|
||||
super::scalbnf16(x, n)
|
||||
}
|
||||
|
|
@ -349,8 +349,10 @@ cfg_if! {
|
|||
mod fmaxf16;
|
||||
mod fminf16;
|
||||
mod fmodf16;
|
||||
mod ldexpf16;
|
||||
mod rintf16;
|
||||
mod roundf16;
|
||||
mod scalbnf16;
|
||||
mod sqrtf16;
|
||||
mod truncf16;
|
||||
|
||||
|
|
@ -362,8 +364,10 @@ cfg_if! {
|
|||
pub use self::fmaxf16::fmaxf16;
|
||||
pub use self::fminf16::fminf16;
|
||||
pub use self::fmodf16::fmodf16;
|
||||
pub use self::ldexpf16::ldexpf16;
|
||||
pub use self::rintf16::rintf16;
|
||||
pub use self::roundf16::roundf16;
|
||||
pub use self::scalbnf16::scalbnf16;
|
||||
pub use self::sqrtf16::sqrtf16;
|
||||
pub use self::truncf16::truncf16;
|
||||
}
|
||||
|
|
@ -379,8 +383,10 @@ cfg_if! {
|
|||
mod fmaxf128;
|
||||
mod fminf128;
|
||||
mod fmodf128;
|
||||
mod ldexpf128;
|
||||
mod rintf128;
|
||||
mod roundf128;
|
||||
mod scalbnf128;
|
||||
mod sqrtf128;
|
||||
mod truncf128;
|
||||
|
||||
|
|
@ -392,8 +398,10 @@ cfg_if! {
|
|||
pub use self::fmaxf128::fmaxf128;
|
||||
pub use self::fminf128::fminf128;
|
||||
pub use self::fmodf128::fmodf128;
|
||||
pub use self::ldexpf128::ldexpf128;
|
||||
pub use self::rintf128::rintf128;
|
||||
pub use self::roundf128::roundf128;
|
||||
pub use self::scalbnf128::scalbnf128;
|
||||
pub use self::sqrtf128::sqrtf128;
|
||||
pub use self::truncf128::truncf128;
|
||||
}
|
||||
|
|
|
|||
4
library/compiler-builtins/libm/src/math/scalbnf128.rs
Normal file
4
library/compiler-builtins/libm/src/math/scalbnf128.rs
Normal file
|
|
@ -0,0 +1,4 @@
|
|||
#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
|
||||
pub fn scalbnf128(x: f128, n: i32) -> f128 {
|
||||
super::generic::scalbn(x, n)
|
||||
}
|
||||
4
library/compiler-builtins/libm/src/math/scalbnf16.rs
Normal file
4
library/compiler-builtins/libm/src/math/scalbnf16.rs
Normal file
|
|
@ -0,0 +1,4 @@
|
|||
#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
|
||||
pub fn scalbnf16(x: f16, n: i32) -> f16 {
|
||||
super::generic::scalbn(x, n)
|
||||
}
|
||||
Loading…
Add table
Add a link
Reference in a new issue