fix division on SPARC (#393)
This commit is contained in:
parent
82bee10b5b
commit
a7548bea0d
3 changed files with 190 additions and 27 deletions
|
|
@ -185,3 +185,133 @@ macro_rules! impl_delegate {
|
|||
}
|
||||
};
|
||||
}
|
||||
|
||||
/// Returns `n / d` and sets `*rem = n % d`.
///
/// This specialization exists because:
/// - The LLVM backend for 32-bit SPARC cannot compile functions that return `(u128, u128)`,
///   so we have to use an old fashioned `&mut u128` argument to return the remainder.
/// - 64-bit SPARC does not have u64 * u64 => u128 widening multiplication, which makes the
///   delegate algorithm strategy the only reasonably fast way to perform `u128` division.
#[doc(hidden)]
pub fn u128_divide_sparc(duo: u128, div: u128, rem: &mut u128) -> u128 {
    use super::*;
    // Split dividend (`duo`) and divisor (`div`) into 64-bit halves.
    let duo_lo = duo as u64;
    let duo_hi = (duo >> 64) as u64;
    let div_lo = div as u64;
    let div_hi = (div >> 64) as u64;

    // Dispatch on which halves are zero; together the three booleans classify
    // the magnitudes of divisor and dividend. The arms below are exhaustive.
    match (div_lo == 0, div_hi == 0, duo_hi == 0) {
        // Divisor is zero: delegate to the division-by-zero handler.
        (true, true, _) => zero_div_fn(),
        // Divisor has high bits set but the dividend does not, so
        // `duo < 2^64 <= div`: quotient is 0 and remainder is `duo`.
        (_, false, true) => {
            *rem = duo;
            return 0;
        }
        // Both operands fit in 64 bits: one hardware 64-bit div/rem suffices.
        (false, true, true) => {
            let tmp = u64_by_u64_div_rem(duo_lo, div_lo);
            *rem = tmp.1 as u128;
            return tmp.0 as u128;
        }
        // Divisor fits in 64 bits, dividend does not.
        (false, true, false) => {
            if duo_hi < div_lo {
                // Quotient fits in 64 bits. Normalize the divisor so its
                // leading bit lines up with the dividend's, then run a
                // shift-and-subtract (binary long division) loop.
                // NOTE(review): `u64_normalization_shift` presumably returns the
                // leading-bit alignment shift between its arguments — confirm
                // against its definition elsewhere in this module.
                let norm_shift = u64_normalization_shift(div_lo, duo_hi, false);
                let shl = if norm_shift == 0 {
                    64 - 1
                } else {
                    64 - norm_shift
                };

                let mut div: u128 = div << shl;
                // `pow_lo` tracks the quotient bit corresponding to the current
                // divisor shift; it is OR-ed in whenever a subtraction succeeds.
                let mut pow_lo: u64 = 1 << shl;
                let mut quo_lo: u64 = 0;
                let mut duo = duo;
                loop {
                    let sub = duo.wrapping_sub(div);
                    // `sub as i128 >= 0` means no borrow: `div <= duo`.
                    if 0 <= (sub as i128) {
                        duo = sub;
                        quo_lo |= pow_lo;
                        let duo_hi = (duo >> 64) as u64;
                        // Once the running dividend fits in 64 bits, finish
                        // with a single hardware division.
                        if duo_hi == 0 {
                            let tmp = u64_by_u64_div_rem(duo as u64, div_lo);
                            *rem = tmp.1 as u128;
                            return (quo_lo | tmp.0) as u128;
                        }
                    }
                    div >>= 1;
                    pow_lo >>= 1;
                }
            } else if duo_hi == div_lo {
                // The high word equals the divisor, so the quotient's bit 64 is
                // set; the low 64 quotient bits come from one more division.
                let tmp = u64_by_u64_div_rem(duo as u64, div as u64);
                *rem = tmp.1 as u128;
                return (1 << 64) | (tmp.0 as u128);
            } else {
                if (div_lo >> 32) == 0 {
                    // Divisor fits in 32 bits: do schoolbook division in three
                    // 32-bit digit steps, feeding each remainder into the next
                    // (more significant remainder becomes the high half of the
                    // next 64-bit partial dividend).
                    let div_0 = div_lo as u32 as u64;
                    let (quo_hi, rem_3) = u64_by_u64_div_rem(duo_hi, div_0);

                    let duo_mid = ((duo >> 32) as u32 as u64) | (rem_3 << 32);
                    let (quo_1, rem_2) = u64_by_u64_div_rem(duo_mid, div_0);

                    let duo_lo = (duo as u32 as u64) | (rem_2 << 32);
                    let (quo_0, rem_1) = u64_by_u64_div_rem(duo_lo, div_0);

                    *rem = rem_1 as u128;
                    return (quo_0 as u128) | ((quo_1 as u128) << 32) | ((quo_hi as u128) << 64);
                }

                // General case: divide the high word first to obtain the upper
                // 64 quotient bits, leaving a (remainder:duo_lo) dividend that
                // is strictly less than `div << 64`.
                let duo_lo = duo as u64;
                let tmp = u64_by_u64_div_rem(duo_hi, div_lo);
                let quo_hi = tmp.0;
                let mut duo = (duo_lo as u128) | ((tmp.1 as u128) << 64);
                if duo < div {
                    *rem = duo;
                    return (quo_hi as u128) << 64;
                }

                // Shift-and-subtract loop for the remaining low 64 quotient
                // bits, starting from the maximal in-range shift of 63.
                let mut div: u128 = div << (64 - 1);
                let mut pow_lo: u64 = 1 << (64 - 1);
                let mut quo_lo: u64 = 0;
                loop {
                    let sub = duo.wrapping_sub(div);
                    if 0 <= (sub as i128) {
                        duo = sub;
                        quo_lo |= pow_lo;
                        let duo_hi = (duo >> 64) as u64;
                        if duo_hi == 0 {
                            // Running dividend fits in 64 bits: finish with one
                            // hardware division and assemble the full quotient.
                            let tmp = u64_by_u64_div_rem(duo as u64, div_lo);
                            *rem = tmp.1 as u128;
                            return (tmp.0) as u128 | (quo_lo as u128) | ((quo_hi as u128) << 64);
                        }
                    }
                    div >>= 1;
                    pow_lo >>= 1;
                }
            }
        }
        // Both divisor and dividend have high bits set, so the quotient fits in
        // 64 bits; plain shift-and-subtract with an early exit once the running
        // dividend drops below the original divisor.
        (_, false, false) => {
            if duo < div {
                *rem = duo;
                return 0;
            }
            let div_original = div;
            let shl = u64_normalization_shift(duo_hi, div_hi, false);
            let mut duo = duo;
            let mut div: u128 = div << shl;
            let mut pow_lo: u64 = 1 << shl;
            let mut quo_lo: u64 = 0;
            loop {
                let sub = duo.wrapping_sub(div);
                if 0 <= (sub as i128) {
                    duo = sub;
                    quo_lo |= pow_lo;
                    // `duo < div_original` means no further quotient bit can be
                    // set; `duo` is the final remainder.
                    if duo < div_original {
                        *rem = duo;
                        return quo_lo as u128;
                    }
                }
                div >>= 1;
                pow_lo >>= 1;
            }
        }
    }
}
|
||||
|
|
|
|||
|
|
@ -46,6 +46,7 @@ mod binary_long;
|
|||
|
||||
#[macro_use]
|
||||
mod delegate;
|
||||
pub use self::delegate::u128_divide_sparc;
|
||||
|
||||
#[macro_use]
|
||||
mod trifecta;
|
||||
|
|
@ -60,27 +61,31 @@ fn zero_div_fn() -> ! {
|
|||
unsafe { core::hint::unreachable_unchecked() }
|
||||
}
|
||||
|
||||
// Whether the normalization-shift helpers should be built on a leading-zeros
// count. A CLZ-style instruction only exists on some architectures/features,
// so this is decided at compile time from `cfg!` predicates. This single
// block-expression const replaces the previous trio of per-architecture
// `#[cfg]`-gated consts (which this diff residue still showed in truncated,
// conflicting form).
const USE_LZ: bool = {
    if cfg!(target_arch = "arm") {
        if cfg!(target_feature = "thumb-mode") {
            // ARM thumb targets have CLZ instructions if the instruction set of ARMv6T2 is
            // supported. This is needed to successfully differentiate between targets like
            // `thumbv8.base` and `thumbv8.main`.
            cfg!(target_feature = "v6t2")
        } else {
            // Regular ARM targets have CLZ instructions if the ARMv5TE instruction set is
            // supported. Technically, ARMv5T was the first to have CLZ, but the "v5t" target
            // feature does not seem to work.
            cfg!(target_feature = "v5te")
        }
    } else if cfg!(any(target_arch = "sparc", target_arch = "sparc64")) {
        // LZD or LZCNT on SPARC only exists for the VIS 3 extension and later.
        cfg!(target_feature = "vis3")
    } else if cfg!(any(target_arch = "riscv32", target_arch = "riscv64")) {
        // The `B` extension on RISC-V determines if a CLZ assembly instruction exists
        cfg!(target_feature = "b")
    } else {
        // All other common targets Rust supports should have CLZ instructions
        true
    }
};
|
||||
|
||||
impl_normalization_shift!(
|
||||
u32_normalization_shift,
|
||||
USE_LZ,
|
||||
|
|
@ -115,8 +120,9 @@ fn u64_by_u64_div_rem(duo: u64, div: u64) -> (u64, u64) {
|
|||
// microarchitecture can multiply and divide. We decide to be optimistic and assume `trifecta` is
|
||||
// faster if the target pointer width is at least 64.
|
||||
#[cfg(all(
|
||||
not(any(target_pointer_width = "16", target_pointer_width = "32")),
|
||||
not(all(not(feature = "no-asm"), target_arch = "x86_64")),
|
||||
not(any(target_pointer_width = "16", target_pointer_width = "32"))
|
||||
not(any(target_arch = "sparc", target_arch = "sparc64"))
|
||||
))]
|
||||
impl_trifecta!(
|
||||
u128_div_rem,
|
||||
|
|
@ -131,8 +137,9 @@ impl_trifecta!(
|
|||
// If the pointer width less than 64, then the target architecture almost certainly does not have
|
||||
// the fast 64 to 128 bit widening multiplication needed for `trifecta` to be faster.
|
||||
#[cfg(all(
|
||||
any(target_pointer_width = "16", target_pointer_width = "32"),
|
||||
not(all(not(feature = "no-asm"), target_arch = "x86_64")),
|
||||
any(target_pointer_width = "16", target_pointer_width = "32")
|
||||
not(any(target_arch = "sparc", target_arch = "sparc64"))
|
||||
))]
|
||||
impl_delegate!(
|
||||
u128_div_rem,
|
||||
|
|
|
|||
|
|
@ -1,3 +1,4 @@
|
|||
pub use int::specialized_div_rem::u128_divide_sparc;
|
||||
use int::specialized_div_rem::*;
|
||||
|
||||
intrinsics! {
|
||||
|
|
@ -46,25 +47,50 @@ intrinsics! {
|
|||
quo_rem.0
|
||||
}
|
||||
|
||||
// Note: we use block configuration and not `if cfg!(...)`, because we need to entirely disable
|
||||
// the existence of `u128_div_rem` to get 32-bit SPARC to compile, see `u128_divide_sparc` docs.
|
||||
|
||||
#[win64_128bit_abi_hack]
|
||||
/// Returns `n / d`
|
||||
pub extern "C" fn __udivti3(n: u128, d: u128) -> u128 {
|
||||
u128_div_rem(n, d).0
|
||||
#[cfg(not(any(target_arch = "sparc", target_arch = "sparc64")))] {
|
||||
u128_div_rem(n, d).0
|
||||
}
|
||||
#[cfg(any(target_arch = "sparc", target_arch = "sparc64"))] {
|
||||
u128_divide_sparc(n, d, &mut 0)
|
||||
}
|
||||
}
|
||||
|
||||
#[win64_128bit_abi_hack]
|
||||
/// Returns `n % d`
|
||||
pub extern "C" fn __umodti3(n: u128, d: u128) -> u128 {
|
||||
u128_div_rem(n, d).1
|
||||
#[cfg(not(any(target_arch = "sparc", target_arch = "sparc64")))] {
|
||||
u128_div_rem(n, d).1
|
||||
}
|
||||
#[cfg(any(target_arch = "sparc", target_arch = "sparc64"))] {
|
||||
let mut rem = 0;
|
||||
u128_divide_sparc(n, d, &mut rem);
|
||||
rem
|
||||
}
|
||||
}
|
||||
|
||||
#[win64_128bit_abi_hack]
|
||||
/// Returns `n / d` and sets `*rem = n % d`
|
||||
pub extern "C" fn __udivmodti4(n: u128, d: u128, rem: Option<&mut u128>) -> u128 {
|
||||
let quo_rem = u128_div_rem(n, d);
|
||||
if let Some(rem) = rem {
|
||||
*rem = quo_rem.1;
|
||||
#[cfg(not(any(target_arch = "sparc", target_arch = "sparc64")))] {
|
||||
let quo_rem = u128_div_rem(n, d);
|
||||
if let Some(rem) = rem {
|
||||
*rem = quo_rem.1;
|
||||
}
|
||||
quo_rem.0
|
||||
}
|
||||
#[cfg(any(target_arch = "sparc", target_arch = "sparc64"))] {
|
||||
let mut tmp = 0;
|
||||
let quo = u128_divide_sparc(n, d, &mut tmp);
|
||||
if let Some(rem) = rem {
|
||||
*rem = tmp;
|
||||
}
|
||||
quo
|
||||
}
|
||||
quo_rem.0
|
||||
}
|
||||
}
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue