Merge pull request #380 from AaronKutch/division-tweaks
This commit is contained in:
commit
a918047f92
8 changed files with 247 additions and 413 deletions
|
|
@ -4,6 +4,7 @@
|
|||
// Compilers will insert the check for zero in cases where it is needed.
|
||||
|
||||
/// Returns the number of leading binary zeros in `x`.
|
||||
#[doc(hidden)]
|
||||
pub fn usize_leading_zeros_default(x: usize) -> usize {
|
||||
// The basic idea is to test if the higher bits of `x` are zero and bisect the number
|
||||
// of leading zeros. It is possible for all branches of the bisection to use the same
|
||||
|
|
@ -75,6 +76,7 @@ pub fn usize_leading_zeros_default(x: usize) -> usize {
|
|||
// RISC-V that allows `(x >= power-of-two) as usize` to be branchless.
|
||||
|
||||
/// Returns the number of leading binary zeros in `x`.
|
||||
#[doc(hidden)]
|
||||
pub fn usize_leading_zeros_riscv(x: usize) -> usize {
|
||||
let mut x = x;
|
||||
// the number of potential leading zeros
|
||||
|
|
|
|||
|
|
@ -1,65 +1,166 @@
|
|||
use int::specialized_div_rem::*;
|
||||
use int::udiv::*;
|
||||
|
||||
/// Generates a signed "divide with remainder" intrinsic named `$signed_fn` on top of the
/// unsigned routine `$unsigned_fn`.
///
/// The generated function divides the magnitudes of `a` and `b` with `$unsigned_fn` and then
/// restores the signs: the quotient is negated when exactly one operand is negative, and the
/// remainder is negated when `a` is negative.
macro_rules! sdivmod {
    (
        $unsigned_fn:ident, // name of the unsigned division function
        $signed_fn:ident, // name of the signed division function
        $uX:ident, // unsigned integer type for the inputs and outputs of `$unsigned_fn`
        $iX:ident, // signed integer type for the inputs and outputs of `$signed_fn`
        $($attr:tt),* // attributes
    ) => {
        intrinsics! {
            $(
                #[$attr]
            )*
            /// Returns `a / b` and sets `*rem = a % b`
            pub extern "C" fn $signed_fn(a: $iX, b: $iX, rem: &mut $iX) -> $iX {
                let a_neg = a < 0;
                let b_neg = b < 0;
                // Operate on magnitudes. `wrapping_neg` leaves `$iX::MIN` unchanged, and that
                // bit pattern reinterpreted as `$uX` is exactly its magnitude.
                let a = if a_neg { a.wrapping_neg() } else { a };
                let b = if b_neg { b.wrapping_neg() } else { b };
                // NOTE(review): the incoming `*rem` only seeds `r`; presumably `$unsigned_fn`
                // always overwrites it through the `Some` it is handed -- confirm.
                let mut r = *rem as $uX;
                let t = $unsigned_fn(a as $uX, b as $uX, Some(&mut r)) as $iX;
                let r = r as $iX;
                // The remainder carries the sign of the dividend.
                *rem = if a_neg { r.wrapping_neg() } else { r };
                // The quotient is negative only when the operand signs differ.
                if a_neg != b_neg {
                    t.wrapping_neg()
                } else {
                    t
                }
            }
        }
    }
}
|
||||
|
||||
/// Generates a signed division intrinsic named `$signed_fn` on top of the unsigned routine
/// `$unsigned_fn`.
///
/// The generated function divides the magnitudes of `a` and `b` with `$unsigned_fn` and negates
/// the result when exactly one of the operands is negative.
macro_rules! sdiv {
    (
        $unsigned_fn:ident, // name of the unsigned division function
        $signed_fn:ident, // name of the signed division function
        $uX:ident, // unsigned integer type for the inputs and outputs of `$unsigned_fn`
        $iX:ident, // signed integer type for the inputs and outputs of `$signed_fn`
        $($attr:tt),* // attributes
    ) => {
        intrinsics! {
            $(
                #[$attr]
            )*
            /// Returns `a / b`
            pub extern "C" fn $signed_fn(a: $iX, b: $iX) -> $iX {
                let a_neg = a < 0;
                let b_neg = b < 0;
                // Operate on magnitudes. `wrapping_neg` leaves `$iX::MIN` unchanged, and that
                // bit pattern reinterpreted as `$uX` is exactly its magnitude.
                let a = if a_neg { a.wrapping_neg() } else { a };
                let b = if b_neg { b.wrapping_neg() } else { b };
                let t = $unsigned_fn(a as $uX, b as $uX) as $iX;
                // The quotient is negative only when the operand signs differ.
                if a_neg != b_neg {
                    t.wrapping_neg()
                } else {
                    t
                }
            }
        }
    }
}
|
||||
|
||||
/// Generates a signed remainder intrinsic named `$signed_fn` on top of the unsigned routine
/// `$unsigned_fn`.
///
/// The generated function takes the remainder of the magnitudes of `a` and `b` with
/// `$unsigned_fn` and negates it when `a` is negative, so the result carries the sign of the
/// dividend.
macro_rules! smod {
    (
        $unsigned_fn:ident, // name of the unsigned remainder function
        $signed_fn:ident, // name of the signed remainder function
        $uX:ident, // unsigned integer type for the inputs and outputs of `$unsigned_fn`
        $iX:ident, // signed integer type for the inputs and outputs of `$signed_fn`
        $($attr:tt),* // attributes
    ) => {
        intrinsics! {
            $(
                #[$attr]
            )*
            /// Returns `a % b`
            pub extern "C" fn $signed_fn(a: $iX, b: $iX) -> $iX {
                let a_neg = a < 0;
                let b_neg = b < 0;
                // Operate on magnitudes. `wrapping_neg` leaves `$iX::MIN` unchanged, and that
                // bit pattern reinterpreted as `$uX` is exactly its magnitude.
                let a = if a_neg { a.wrapping_neg() } else { a };
                let b = if b_neg { b.wrapping_neg() } else { b };
                let r = $unsigned_fn(a as $uX, b as $uX) as $iX;
                // Only the sign of the dividend affects the sign of the remainder.
                if a_neg {
                    r.wrapping_neg()
                } else {
                    r
                }
            }
        }
    }
}
|
||||
|
||||
sdivmod!(
|
||||
__udivmodsi4,
|
||||
__divmodsi4,
|
||||
u32,
|
||||
i32,
|
||||
maybe_use_optimized_c_shim
|
||||
);
|
||||
// The `#[arm_aeabi_alias = __aeabi_idiv]` attribute cannot be made to work with `intrinsics!` in macros
|
||||
intrinsics! {
|
||||
#[maybe_use_optimized_c_shim]
|
||||
#[arm_aeabi_alias = __aeabi_idiv]
|
||||
/// Returns `n / d`
|
||||
pub extern "C" fn __divsi3(a: i32, b: i32) -> i32 {
|
||||
i32_div_rem(a, b).0
|
||||
}
|
||||
|
||||
#[maybe_use_optimized_c_shim]
|
||||
/// Returns `n % d`
|
||||
pub extern "C" fn __modsi3(a: i32, b: i32) -> i32 {
|
||||
i32_div_rem(a, b).1
|
||||
}
|
||||
|
||||
#[maybe_use_optimized_c_shim]
|
||||
/// Returns `n / d` and sets `*rem = n % d`
|
||||
pub extern "C" fn __divmodsi4(a: i32, b: i32, rem: &mut i32) -> i32 {
|
||||
let quo_rem = i32_div_rem(a, b);
|
||||
*rem = quo_rem.1;
|
||||
quo_rem.0
|
||||
}
|
||||
|
||||
#[maybe_use_optimized_c_shim]
|
||||
/// Returns `n / d`
|
||||
pub extern "C" fn __divdi3(a: i64, b: i64) -> i64 {
|
||||
i64_div_rem(a, b).0
|
||||
}
|
||||
|
||||
#[maybe_use_optimized_c_shim]
|
||||
/// Returns `n % d`
|
||||
pub extern "C" fn __moddi3(a: i64, b: i64) -> i64 {
|
||||
i64_div_rem(a, b).1
|
||||
}
|
||||
|
||||
#[maybe_use_optimized_c_shim]
|
||||
/// Returns `n / d` and sets `*rem = n % d`
|
||||
pub extern "C" fn __divmoddi4(a: i64, b: i64, rem: &mut i64) -> i64 {
|
||||
let quo_rem = i64_div_rem(a, b);
|
||||
*rem = quo_rem.1;
|
||||
quo_rem.0
|
||||
}
|
||||
|
||||
#[win64_128bit_abi_hack]
|
||||
/// Returns `n / d`
|
||||
pub extern "C" fn __divti3(a: i128, b: i128) -> i128 {
|
||||
i128_div_rem(a, b).0
|
||||
}
|
||||
|
||||
#[win64_128bit_abi_hack]
|
||||
/// Returns `n % d`
|
||||
pub extern "C" fn __modti3(a: i128, b: i128) -> i128 {
|
||||
i128_div_rem(a, b).1
|
||||
}
|
||||
|
||||
// LLVM does not currently have a `__divmodti4` function, but GCC does
|
||||
#[maybe_use_optimized_c_shim]
|
||||
/// Returns `n / d` and sets `*rem = n % d`
|
||||
pub extern "C" fn __divmodti4(a: i128, b: i128, rem: &mut i128) -> i128 {
|
||||
let quo_rem = i128_div_rem(a, b);
|
||||
*rem = quo_rem.1;
|
||||
quo_rem.0
|
||||
let a_neg = a < 0;
|
||||
let b_neg = b < 0;
|
||||
let mut a = a;
|
||||
let mut b = b;
|
||||
if a_neg {
|
||||
a = a.wrapping_neg();
|
||||
}
|
||||
if b_neg {
|
||||
b = b.wrapping_neg();
|
||||
}
|
||||
let t = __udivsi3(a as u32, b as u32) as i32;
|
||||
if a_neg != b_neg {
|
||||
t.wrapping_neg()
|
||||
} else {
|
||||
t
|
||||
}
|
||||
}
|
||||
}
|
||||
smod!(__umodsi3, __modsi3, u32, i32, maybe_use_optimized_c_shim);
|
||||
|
||||
sdivmod!(
|
||||
__udivmoddi4,
|
||||
__divmoddi4,
|
||||
u64,
|
||||
i64,
|
||||
maybe_use_optimized_c_shim
|
||||
);
|
||||
sdiv!(__udivdi3, __divdi3, u64, i64, maybe_use_optimized_c_shim);
|
||||
smod!(__umoddi3, __moddi3, u64, i64, maybe_use_optimized_c_shim);
|
||||
|
||||
// LLVM does not currently have a `__divmodti4` function, but GCC does
|
||||
sdivmod!(
|
||||
__udivmodti4,
|
||||
__divmodti4,
|
||||
u128,
|
||||
i128,
|
||||
maybe_use_optimized_c_shim
|
||||
);
|
||||
sdiv!(__udivti3, __divti3, u128, i128, win64_128bit_abi_hack);
|
||||
smod!(__umodti3, __modti3, u128, i128, win64_128bit_abi_hack);
|
||||
|
|
|
|||
|
|
@ -1,44 +1,26 @@
|
|||
/// Creates unsigned and signed division functions optimized for dividing integers with the same
|
||||
/// Creates an unsigned division function optimized for dividing integers with the same
|
||||
/// bitwidth as the largest operand in an asymmetrically sized division. For example, x86-64 has an
|
||||
/// assembly instruction that can divide a 128 bit integer by a 64 bit integer if the quotient fits
|
||||
/// in 64 bits. The 128 bit version of this algorithm would use that fast hardware division to
|
||||
/// construct a full 128 bit by 128 bit division.
|
||||
#[doc(hidden)]
|
||||
#[macro_export]
|
||||
macro_rules! impl_asymmetric {
|
||||
(
|
||||
$unsigned_name:ident, // name of the unsigned division function
|
||||
$signed_name:ident, // name of the signed division function
|
||||
$fn:ident, // name of the unsigned division function
|
||||
$zero_div_fn:ident, // function called when division by zero is attempted
|
||||
$half_division:ident, // function for division of a $uX by a $uX
|
||||
$asymmetric_division:ident, // function for division of a $uD by a $uX
|
||||
$n_h:expr, // the number of bits in a $iH or $uH
|
||||
$uH:ident, // unsigned integer with half the bit width of $uX
|
||||
$uX:ident, // unsigned integer with half the bit width of $uD
|
||||
$uD:ident, // unsigned integer type for the inputs and outputs of `$unsigned_name`
|
||||
$iD:ident, // signed integer type for the inputs and outputs of `$signed_name`
|
||||
$($unsigned_attr:meta),*; // attributes for the unsigned function
|
||||
$($signed_attr:meta),* // attributes for the signed function
|
||||
$uD:ident // unsigned integer type for the inputs and outputs of `$fn`
|
||||
) => {
|
||||
/// Computes the quotient and remainder of `duo` divided by `div` and returns them as a
|
||||
/// tuple.
|
||||
$(
|
||||
#[$unsigned_attr]
|
||||
)*
|
||||
pub fn $unsigned_name(duo: $uD, div: $uD) -> ($uD,$uD) {
|
||||
fn carrying_mul(lhs: $uX, rhs: $uX) -> ($uX, $uX) {
|
||||
let tmp = (lhs as $uD).wrapping_mul(rhs as $uD);
|
||||
(tmp as $uX, (tmp >> ($n_h * 2)) as $uX)
|
||||
}
|
||||
fn carrying_mul_add(lhs: $uX, mul: $uX, add: $uX) -> ($uX, $uX) {
|
||||
let tmp = (lhs as $uD).wrapping_mul(mul as $uD).wrapping_add(add as $uD);
|
||||
(tmp as $uX, (tmp >> ($n_h * 2)) as $uX)
|
||||
}
|
||||
|
||||
pub fn $fn(duo: $uD, div: $uD) -> ($uD, $uD) {
|
||||
let n: u32 = $n_h * 2;
|
||||
|
||||
// Many of these subalgorithms are taken from trifecta.rs, see that for better
|
||||
// documentation.
|
||||
|
||||
let duo_lo = duo as $uX;
|
||||
let duo_hi = (duo >> n) as $uX;
|
||||
let div_lo = div as $uX;
|
||||
|
|
@ -50,120 +32,39 @@ macro_rules! impl_asymmetric {
|
|||
if duo_hi < div_lo {
|
||||
// `$uD` by `$uX` division with a quotient that will fit into a `$uX`
|
||||
let (quo, rem) = unsafe { $asymmetric_division(duo, div_lo) };
|
||||
return (quo as $uD, rem as $uD)
|
||||
} else if (div_lo >> $n_h) == 0 {
|
||||
// Short division of $uD by a $uH.
|
||||
|
||||
// Some x86_64 CPUs have bad division implementations that make specializing
|
||||
// this case faster.
|
||||
let div_0 = div_lo as $uH as $uX;
|
||||
let (quo_hi, rem_3) = $half_division(duo_hi, div_0);
|
||||
|
||||
let duo_mid =
|
||||
((duo >> $n_h) as $uH as $uX)
|
||||
| (rem_3 << $n_h);
|
||||
let (quo_1, rem_2) = $half_division(duo_mid, div_0);
|
||||
|
||||
let duo_lo =
|
||||
(duo as $uH as $uX)
|
||||
| (rem_2 << $n_h);
|
||||
let (quo_0, rem_1) = $half_division(duo_lo, div_0);
|
||||
|
||||
return (
|
||||
(quo_0 as $uD)
|
||||
| ((quo_1 as $uD) << $n_h)
|
||||
| ((quo_hi as $uD) << n),
|
||||
rem_1 as $uD
|
||||
)
|
||||
return (quo as $uD, rem as $uD);
|
||||
} else {
|
||||
// Short division using the $uD by $uX division
|
||||
let (quo_hi, rem_hi) = $half_division(duo_hi, div_lo);
|
||||
let tmp = unsafe {
|
||||
$asymmetric_division((duo_lo as $uD) | ((rem_hi as $uD) << n), div_lo)
|
||||
};
|
||||
return ((tmp.0 as $uD) | ((quo_hi as $uD) << n), tmp.1 as $uD)
|
||||
return ((tmp.0 as $uD) | ((quo_hi as $uD) << n), tmp.1 as $uD);
|
||||
}
|
||||
}
|
||||
|
||||
let duo_lz = duo_hi.leading_zeros();
|
||||
// This has been adapted from
|
||||
// https://www.codeproject.com/tips/785014/uint-division-modulus which was in turn
|
||||
// adapted from Hacker's Delight. This is similar to the two possibility algorithm
|
||||
// in that it uses only more significant parts of `duo` and `div` to divide a large
|
||||
// integer with a smaller division instruction.
|
||||
let div_lz = div_hi.leading_zeros();
|
||||
let rel_leading_sb = div_lz.wrapping_sub(duo_lz);
|
||||
if rel_leading_sb < $n_h {
|
||||
// Some x86_64 CPUs have bad hardware division implementations that make putting
|
||||
// a two possibility algorithm here beneficial. We also avoid a full `$uD`
|
||||
// multiplication.
|
||||
let shift = n - duo_lz;
|
||||
let duo_sig_n = (duo >> shift) as $uX;
|
||||
let div_sig_n = (div >> shift) as $uX;
|
||||
let quo = $half_division(duo_sig_n, div_sig_n).0;
|
||||
let div_lo = div as $uX;
|
||||
let div_hi = (div >> n) as $uX;
|
||||
let (tmp_lo, carry) = carrying_mul(quo, div_lo);
|
||||
let (tmp_hi, overflow) = carrying_mul_add(quo, div_hi, carry);
|
||||
let tmp = (tmp_lo as $uD) | ((tmp_hi as $uD) << n);
|
||||
if (overflow != 0) || (duo < tmp) {
|
||||
return (
|
||||
(quo - 1) as $uD,
|
||||
duo.wrapping_add(div).wrapping_sub(tmp)
|
||||
)
|
||||
} else {
|
||||
return (
|
||||
quo as $uD,
|
||||
duo - tmp
|
||||
)
|
||||
}
|
||||
} else {
|
||||
// This has been adapted from
|
||||
// https://www.codeproject.com/tips/785014/uint-division-modulus which was in turn
|
||||
// adapted from Hacker's Delight. This is similar to the two possibility algorithm
|
||||
// in that it uses only more significant parts of `duo` and `div` to divide a large
|
||||
// integer with a smaller division instruction.
|
||||
let div_extra = n - div_lz;
|
||||
let div_sig_n = (div >> div_extra) as $uX;
|
||||
let tmp = unsafe { $asymmetric_division(duo >> 1, div_sig_n) };
|
||||
|
||||
let div_extra = n - div_lz;
|
||||
let div_sig_n = (div >> div_extra) as $uX;
|
||||
let tmp = unsafe {
|
||||
$asymmetric_division(duo >> 1, div_sig_n)
|
||||
};
|
||||
|
||||
let mut quo = tmp.0 >> ((n - 1) - div_lz);
|
||||
if quo != 0 {
|
||||
quo -= 1;
|
||||
}
|
||||
|
||||
// Note that this is a full `$uD` multiplication being used here
|
||||
let mut rem = duo - (quo as $uD).wrapping_mul(div);
|
||||
if div <= rem {
|
||||
quo += 1;
|
||||
rem -= div;
|
||||
}
|
||||
return (quo as $uD, rem)
|
||||
let mut quo = tmp.0 >> ((n - 1) - div_lz);
|
||||
if quo != 0 {
|
||||
quo -= 1;
|
||||
}
|
||||
}
|
||||
|
||||
/// Computes the quotient and remainder of `duo` divided by `div` and returns them as a
|
||||
/// tuple.
|
||||
$(
|
||||
#[$signed_attr]
|
||||
)*
|
||||
pub fn $signed_name(duo: $iD, div: $iD) -> ($iD, $iD) {
|
||||
match (duo < 0, div < 0) {
|
||||
(false, false) => {
|
||||
let t = $unsigned_name(duo as $uD, div as $uD);
|
||||
(t.0 as $iD, t.1 as $iD)
|
||||
},
|
||||
(true, false) => {
|
||||
let t = $unsigned_name(duo.wrapping_neg() as $uD, div as $uD);
|
||||
((t.0 as $iD).wrapping_neg(), (t.1 as $iD).wrapping_neg())
|
||||
},
|
||||
(false, true) => {
|
||||
let t = $unsigned_name(duo as $uD, div.wrapping_neg() as $uD);
|
||||
((t.0 as $iD).wrapping_neg(), t.1 as $iD)
|
||||
},
|
||||
(true, true) => {
|
||||
let t = $unsigned_name(duo.wrapping_neg() as $uD, div.wrapping_neg() as $uD);
|
||||
(t.0 as $iD, (t.1 as $iD).wrapping_neg())
|
||||
},
|
||||
// Note that this is a full `$uD` multiplication being used here
|
||||
let mut rem = duo - (quo as $uD).wrapping_mul(div);
|
||||
if div <= rem {
|
||||
quo += 1;
|
||||
rem -= div;
|
||||
}
|
||||
return (quo as $uD, rem);
|
||||
}
|
||||
}
|
||||
};
|
||||
}
|
||||
|
|
|
|||
|
|
@ -1,35 +1,30 @@
|
|||
/// Creates unsigned and signed division functions that use binary long division, designed for
|
||||
/// Creates an unsigned division function that uses binary long division, designed for
|
||||
/// computer architectures without division instructions. These functions have good performance for
|
||||
/// microarchitectures with large branch miss penalties and architectures without the ability to
|
||||
/// predicate instructions. For architectures with predicated instructions, one of the algorithms
|
||||
/// described in the documentation of these functions probably has higher performance, and a custom
|
||||
/// assembly routine should be used instead.
|
||||
#[doc(hidden)]
|
||||
#[macro_export]
|
||||
macro_rules! impl_binary_long {
|
||||
(
|
||||
$unsigned_name:ident, // name of the unsigned division function
|
||||
$signed_name:ident, // name of the signed division function
|
||||
$fn:ident, // name of the unsigned division function
|
||||
$zero_div_fn:ident, // function called when division by zero is attempted
|
||||
$normalization_shift:ident, // function for finding the normalization shift
|
||||
$n:tt, // the number of bits in a $iX or $uX
|
||||
$uX:ident, // unsigned integer type for the inputs and outputs of `$unsigned_name`
|
||||
$iX:ident, // signed integer type for the inputs and outputs of `$signed_name`
|
||||
$($unsigned_attr:meta),*; // attributes for the unsigned function
|
||||
$($signed_attr:meta),* // attributes for the signed function
|
||||
$uX:ident, // unsigned integer type for the inputs and outputs of `$fn`
|
||||
$iX:ident // signed integer type with same bitwidth as `$uX`
|
||||
) => {
|
||||
/// Computes the quotient and remainder of `duo` divided by `div` and returns them as a
|
||||
/// tuple.
|
||||
$(
|
||||
#[$unsigned_attr]
|
||||
)*
|
||||
pub fn $unsigned_name(duo: $uX, div: $uX) -> ($uX, $uX) {
|
||||
pub fn $fn(duo: $uX, div: $uX) -> ($uX, $uX) {
|
||||
let mut duo = duo;
|
||||
// handle edge cases before calling `$normalization_shift`
|
||||
if div == 0 {
|
||||
$zero_div_fn()
|
||||
}
|
||||
if duo < div {
|
||||
return (0, duo)
|
||||
return (0, duo);
|
||||
}
|
||||
|
||||
// There are many variations of binary division algorithm that could be used. This
|
||||
|
|
@ -430,7 +425,7 @@ macro_rules! impl_binary_long {
|
|||
let mut i = shl;
|
||||
loop {
|
||||
if i == 0 {
|
||||
break
|
||||
break;
|
||||
}
|
||||
i -= 1;
|
||||
// shift left 1 and subtract
|
||||
|
|
@ -550,47 +545,5 @@ macro_rules! impl_binary_long {
|
|||
return ((duo & mask) | quo, duo >> shl);
|
||||
*/
|
||||
}
|
||||
|
||||
/// Computes the quotient and remainder of `duo` divided by `div` and returns them as a
|
||||
/// tuple.
|
||||
$(
|
||||
#[$signed_attr]
|
||||
)*
|
||||
pub fn $signed_name(duo: $iX, div: $iX) -> ($iX, $iX) {
|
||||
// There is a way of doing this without any branches, but requires too many extra
|
||||
// operations to be faster.
|
||||
/*
|
||||
let duo_s = duo >> ($n - 1);
|
||||
let div_s = div >> ($n - 1);
|
||||
let duo = (duo ^ duo_s).wrapping_sub(duo_s);
|
||||
let div = (div ^ div_s).wrapping_sub(div_s);
|
||||
let quo_s = duo_s ^ div_s;
|
||||
let rem_s = duo_s;
|
||||
let tmp = $unsigned_name(duo as $uX, div as $uX);
|
||||
(
|
||||
((tmp.0 as $iX) ^ quo_s).wrapping_sub(quo_s),
|
||||
((tmp.1 as $iX) ^ rem_s).wrapping_sub(rem_s),
|
||||
)
|
||||
*/
|
||||
|
||||
match (duo < 0, div < 0) {
|
||||
(false, false) => {
|
||||
let t = $unsigned_name(duo as $uX, div as $uX);
|
||||
(t.0 as $iX, t.1 as $iX)
|
||||
},
|
||||
(true, false) => {
|
||||
let t = $unsigned_name(duo.wrapping_neg() as $uX, div as $uX);
|
||||
((t.0 as $iX).wrapping_neg(), (t.1 as $iX).wrapping_neg())
|
||||
},
|
||||
(false, true) => {
|
||||
let t = $unsigned_name(duo as $uX, div.wrapping_neg() as $uX);
|
||||
((t.0 as $iX).wrapping_neg(), t.1 as $iX)
|
||||
},
|
||||
(true, true) => {
|
||||
let t = $unsigned_name(duo.wrapping_neg() as $uX, div.wrapping_neg() as $uX);
|
||||
(t.0 as $iX, (t.1 as $iX).wrapping_neg())
|
||||
},
|
||||
}
|
||||
}
|
||||
}
|
||||
};
|
||||
}
|
||||
|
|
|
|||
|
|
@ -1,29 +1,24 @@
|
|||
/// Creates unsigned and signed division functions that use a combination of hardware division and
|
||||
/// Creates an unsigned division function that uses a combination of hardware division and
|
||||
/// binary long division to divide integers larger than what hardware division by itself can do. This
|
||||
/// function is intended for microarchitectures that have division hardware, but not fast enough
|
||||
/// multiplication hardware for `impl_trifecta` to be faster.
|
||||
#[doc(hidden)]
|
||||
#[macro_export]
|
||||
macro_rules! impl_delegate {
|
||||
(
|
||||
$unsigned_name:ident, // name of the unsigned division function
|
||||
$signed_name:ident, // name of the signed division function
|
||||
$fn:ident, // name of the unsigned division function
|
||||
$zero_div_fn:ident, // function called when division by zero is attempted
|
||||
$half_normalization_shift:ident, // function for finding the normalization shift of $uX
|
||||
$half_division:ident, // function for division of a $uX by a $uX
|
||||
$n_h:expr, // the number of bits in $iH or $uH
|
||||
$uH:ident, // unsigned integer with half the bit width of $uX
|
||||
$uX:ident, // unsigned integer with half the bit width of $uD.
|
||||
$uD:ident, // unsigned integer type for the inputs and outputs of `$unsigned_name`
|
||||
$iD:ident, // signed integer type for the inputs and outputs of `$signed_name`
|
||||
$($unsigned_attr:meta),*; // attributes for the unsigned function
|
||||
$($signed_attr:meta),* // attributes for the signed function
|
||||
$uD:ident, // unsigned integer type for the inputs and outputs of `$fn`
|
||||
$iD:ident // signed integer type with the same bitwidth as `$uD`
|
||||
) => {
|
||||
/// Computes the quotient and remainder of `duo` divided by `div` and returns them as a
|
||||
/// tuple.
|
||||
$(
|
||||
#[$unsigned_attr]
|
||||
)*
|
||||
pub fn $unsigned_name(duo: $uD, div: $uD) -> ($uD, $uD) {
|
||||
pub fn $fn(duo: $uD, div: $uD) -> ($uD, $uD) {
|
||||
// The two possibility algorithm, undersubtracting long division algorithm, or any kind
|
||||
// of reciprocal based algorithm will not be fastest, because they involve large
|
||||
// multiplications that we assume to not be fast enough relative to the divisions to
|
||||
|
|
@ -38,17 +33,15 @@ macro_rules! impl_delegate {
|
|||
let div_hi = (div >> n) as $uX;
|
||||
|
||||
match (div_lo == 0, div_hi == 0, duo_hi == 0) {
|
||||
(true, true, _) => {
|
||||
$zero_div_fn()
|
||||
}
|
||||
(true, true, _) => $zero_div_fn(),
|
||||
(_, false, true) => {
|
||||
// `duo` < `div`
|
||||
return (0, duo)
|
||||
return (0, duo);
|
||||
}
|
||||
(false, true, true) => {
|
||||
// delegate to smaller division
|
||||
let tmp = $half_division(duo_lo, div_lo);
|
||||
return (tmp.0 as $uD, tmp.1 as $uD)
|
||||
return (tmp.0 as $uD, tmp.1 as $uD);
|
||||
}
|
||||
(false, true, false) => {
|
||||
if duo_hi < div_lo {
|
||||
|
|
@ -96,7 +89,7 @@ macro_rules! impl_delegate {
|
|||
// Delegate to get the rest of the quotient. Note that the
|
||||
// `div_lo` here is the original unshifted `div`.
|
||||
let tmp = $half_division(duo as $uX, div_lo);
|
||||
return ((quo_lo | tmp.0) as $uD, tmp.1 as $uD)
|
||||
return ((quo_lo | tmp.0) as $uD, tmp.1 as $uD);
|
||||
}
|
||||
}
|
||||
div >>= 1;
|
||||
|
|
@ -105,7 +98,7 @@ macro_rules! impl_delegate {
|
|||
} else if duo_hi == div_lo {
|
||||
// `quo_hi == 1`. This branch is cheap and helps with edge cases.
|
||||
let tmp = $half_division(duo as $uX, div as $uX);
|
||||
return ((1 << n) | (tmp.0 as $uD), tmp.1 as $uD)
|
||||
return ((1 << n) | (tmp.0 as $uD), tmp.1 as $uD);
|
||||
} else {
|
||||
// `div_lo < duo_hi`
|
||||
// `rem_hi == 0`
|
||||
|
|
@ -114,22 +107,16 @@ macro_rules! impl_delegate {
|
|||
let div_0 = div_lo as $uH as $uX;
|
||||
let (quo_hi, rem_3) = $half_division(duo_hi, div_0);
|
||||
|
||||
let duo_mid =
|
||||
((duo >> $n_h) as $uH as $uX)
|
||||
| (rem_3 << $n_h);
|
||||
let duo_mid = ((duo >> $n_h) as $uH as $uX) | (rem_3 << $n_h);
|
||||
let (quo_1, rem_2) = $half_division(duo_mid, div_0);
|
||||
|
||||
let duo_lo =
|
||||
(duo as $uH as $uX)
|
||||
| (rem_2 << $n_h);
|
||||
let duo_lo = (duo as $uH as $uX) | (rem_2 << $n_h);
|
||||
let (quo_0, rem_1) = $half_division(duo_lo, div_0);
|
||||
|
||||
return (
|
||||
(quo_0 as $uD)
|
||||
| ((quo_1 as $uD) << $n_h)
|
||||
| ((quo_hi as $uD) << n),
|
||||
rem_1 as $uD
|
||||
)
|
||||
(quo_0 as $uD) | ((quo_1 as $uD) << $n_h) | ((quo_hi as $uD) << n),
|
||||
rem_1 as $uD,
|
||||
);
|
||||
}
|
||||
|
||||
// This is basically a short division composed of a half division for the hi
|
||||
|
|
@ -161,7 +148,7 @@ macro_rules! impl_delegate {
|
|||
let tmp = $half_division(duo as $uX, div_lo);
|
||||
return (
|
||||
(tmp.0) as $uD | (quo_lo as $uD) | ((quo_hi as $uD) << n),
|
||||
tmp.1 as $uD
|
||||
tmp.1 as $uD,
|
||||
);
|
||||
}
|
||||
}
|
||||
|
|
@ -187,7 +174,7 @@ macro_rules! impl_delegate {
|
|||
duo = sub;
|
||||
quo_lo |= pow_lo;
|
||||
if duo < div_original {
|
||||
return (quo_lo as $uD, duo)
|
||||
return (quo_lo as $uD, duo);
|
||||
}
|
||||
}
|
||||
div >>= 1;
|
||||
|
|
@ -196,31 +183,5 @@ macro_rules! impl_delegate {
|
|||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Computes the quotient and remainder of `duo` divided by `div` and returns them as a
|
||||
/// tuple.
|
||||
$(
|
||||
#[$signed_attr]
|
||||
)*
|
||||
pub fn $signed_name(duo: $iD, div: $iD) -> ($iD, $iD) {
|
||||
match (duo < 0, div < 0) {
|
||||
(false, false) => {
|
||||
let t = $unsigned_name(duo as $uD, div as $uD);
|
||||
(t.0 as $iD, t.1 as $iD)
|
||||
},
|
||||
(true, false) => {
|
||||
let t = $unsigned_name(duo.wrapping_neg() as $uD, div as $uD);
|
||||
((t.0 as $iD).wrapping_neg(), (t.1 as $iD).wrapping_neg())
|
||||
},
|
||||
(false, true) => {
|
||||
let t = $unsigned_name(duo as $uD, div.wrapping_neg() as $uD);
|
||||
((t.0 as $iD).wrapping_neg(), t.1 as $iD)
|
||||
},
|
||||
(true, true) => {
|
||||
let t = $unsigned_name(duo.wrapping_neg() as $uD, div.wrapping_neg() as $uD);
|
||||
(t.0 as $iD, (t.1 as $iD).wrapping_neg())
|
||||
},
|
||||
}
|
||||
}
|
||||
}
|
||||
};
|
||||
}
|
||||
|
|
|
|||
|
|
@ -111,13 +111,6 @@ fn u64_by_u64_div_rem(duo: u64, div: u64) -> (u64, u64) {
|
|||
zero_div_fn()
|
||||
}
|
||||
|
||||
// `inline(never)` is placed on unsigned division functions so that there are just three division
|
||||
// functions (`u32_div_rem`, `u64_div_rem`, and `u128_div_rem`) backing all `compiler-builtins`
|
||||
// division functions. The signed functions like `i32_div_rem` will get inlined into the
|
||||
// `compiler-builtins` signed division functions, so that they directly call the three division
|
||||
// functions. Otherwise, LLVM may try to inline the unsigned division functions 4 times into the
|
||||
// signed division functions, which results in an explosion in code size.
|
||||
|
||||
// Whether `trifecta` or `delegate` is faster for 128 bit division depends on the speed at which a
|
||||
// microarchitecture can multiply and divide. We decide to be optimistic and assume `trifecta` is
|
||||
// faster if the target pointer width is at least 64.
|
||||
|
|
@ -127,16 +120,12 @@ fn u64_by_u64_div_rem(duo: u64, div: u64) -> (u64, u64) {
|
|||
))]
|
||||
impl_trifecta!(
|
||||
u128_div_rem,
|
||||
i128_div_rem,
|
||||
zero_div_fn,
|
||||
u64_by_u64_div_rem,
|
||||
32,
|
||||
u32,
|
||||
u64,
|
||||
u128,
|
||||
i128,
|
||||
inline(never);
|
||||
inline
|
||||
u128
|
||||
);
|
||||
|
||||
// If the pointer width is less than 64, then the target architecture almost certainly does not have
|
||||
|
|
@ -147,7 +136,6 @@ impl_trifecta!(
|
|||
))]
|
||||
impl_delegate!(
|
||||
u128_div_rem,
|
||||
i128_div_rem,
|
||||
zero_div_fn,
|
||||
u64_normalization_shift,
|
||||
u64_by_u64_div_rem,
|
||||
|
|
@ -155,9 +143,7 @@ impl_delegate!(
|
|||
u32,
|
||||
u64,
|
||||
u128,
|
||||
i128,
|
||||
inline(never);
|
||||
inline
|
||||
i128
|
||||
);
|
||||
|
||||
/// Divides `duo` by `div` and returns a tuple of the quotient and the remainder.
|
||||
|
|
@ -191,17 +177,13 @@ unsafe fn u128_by_u64_div_rem(duo: u128, div: u64) -> (u64, u64) {
|
|||
#[cfg(all(feature = "asm", target_arch = "x86_64"))]
|
||||
impl_asymmetric!(
|
||||
u128_div_rem,
|
||||
i128_div_rem,
|
||||
zero_div_fn,
|
||||
u64_by_u64_div_rem,
|
||||
u128_by_u64_div_rem,
|
||||
32,
|
||||
u32,
|
||||
u64,
|
||||
u128,
|
||||
i128,
|
||||
inline(never);
|
||||
inline
|
||||
u128
|
||||
);
|
||||
|
||||
/// Divides `duo` by `div` and returns a tuple of the quotient and the remainder.
|
||||
|
|
@ -226,7 +208,6 @@ fn u32_by_u32_div_rem(duo: u32, div: u32) -> (u32, u32) {
|
|||
))]
|
||||
impl_delegate!(
|
||||
u64_div_rem,
|
||||
i64_div_rem,
|
||||
zero_div_fn,
|
||||
u32_normalization_shift,
|
||||
u32_by_u32_div_rem,
|
||||
|
|
@ -234,9 +215,7 @@ impl_delegate!(
|
|||
u16,
|
||||
u32,
|
||||
u64,
|
||||
i64,
|
||||
inline(never);
|
||||
inline
|
||||
i64
|
||||
);
|
||||
|
||||
// When not on x86 and the pointer width is 64, use `binary_long`.
|
||||
|
|
@ -246,14 +225,11 @@ impl_delegate!(
|
|||
))]
|
||||
impl_binary_long!(
|
||||
u64_div_rem,
|
||||
i64_div_rem,
|
||||
zero_div_fn,
|
||||
u64_normalization_shift,
|
||||
64,
|
||||
u64,
|
||||
i64,
|
||||
inline(never);
|
||||
inline
|
||||
i64
|
||||
);
|
||||
|
||||
/// Divides `duo` by `div` and returns a tuple of the quotient and the remainder.
|
||||
|
|
@ -287,28 +263,21 @@ unsafe fn u64_by_u32_div_rem(duo: u64, div: u32) -> (u32, u32) {
|
|||
#[cfg(all(feature = "asm", target_arch = "x86"))]
|
||||
impl_asymmetric!(
|
||||
u64_div_rem,
|
||||
i64_div_rem,
|
||||
zero_div_fn,
|
||||
u32_by_u32_div_rem,
|
||||
u64_by_u32_div_rem,
|
||||
16,
|
||||
u16,
|
||||
u32,
|
||||
u64,
|
||||
i64,
|
||||
inline(never);
|
||||
inline
|
||||
u64
|
||||
);
|
||||
|
||||
// 32 bits is the smallest division used by `compiler-builtins`, so we end with binary long division
|
||||
impl_binary_long!(
|
||||
u32_div_rem,
|
||||
i32_div_rem,
|
||||
zero_div_fn,
|
||||
u32_normalization_shift,
|
||||
32,
|
||||
u32,
|
||||
i32,
|
||||
inline(never);
|
||||
inline
|
||||
i32
|
||||
);
|
||||
|
|
|
|||
|
|
@ -1,4 +1,5 @@
|
|||
/// Creates a function used by some division algorithms to compute the "normalization shift".
|
||||
#[doc(hidden)]
|
||||
#[macro_export]
|
||||
macro_rules! impl_normalization_shift {
|
||||
(
|
||||
|
|
|
|||
|
|
@ -1,28 +1,22 @@
|
|||
/// Creates unsigned and signed division functions optimized for division of integers with bitwidths
|
||||
/// Creates an unsigned division function optimized for division of integers with bitwidths
|
||||
/// larger than the largest hardware integer division supported. These functions use large radix
|
||||
/// division algorithms that require both fast division and very fast widening multiplication on the
|
||||
/// target microarchitecture. Otherwise, `impl_delegate` should be used instead.
|
||||
#[doc(hidden)]
|
||||
#[macro_export]
|
||||
macro_rules! impl_trifecta {
|
||||
(
|
||||
$unsigned_name:ident, // name of the unsigned division function
|
||||
$signed_name:ident, // name of the signed division function
|
||||
$fn:ident, // name of the unsigned division function
|
||||
$zero_div_fn:ident, // function called when division by zero is attempted
|
||||
$half_division:ident, // function for division of a $uX by a $uX
|
||||
$n_h:expr, // the number of bits in $iH or $uH
|
||||
$uH:ident, // unsigned integer with half the bit width of $uX
|
||||
$uX:ident, // unsigned integer with half the bit width of $uD
|
||||
$uD:ident, // unsigned integer type for the inputs and outputs of `$unsigned_name`
|
||||
$iD:ident, // signed integer type for the inputs and outputs of `$signed_name`
|
||||
$($unsigned_attr:meta),*; // attributes for the unsigned function
|
||||
$($signed_attr:meta),* // attributes for the signed function
|
||||
$uD:ident // unsigned integer type for the inputs and outputs of `$unsigned_name`
|
||||
) => {
|
||||
/// Computes the quotient and remainder of `duo` divided by `div` and returns them as a
|
||||
/// tuple.
|
||||
$(
|
||||
#[$unsigned_attr]
|
||||
)*
|
||||
pub fn $unsigned_name(duo: $uD, div: $uD) -> ($uD, $uD) {
|
||||
pub fn $fn(duo: $uD, div: $uD) -> ($uD, $uD) {
|
||||
// This is called the trifecta algorithm because it uses three main algorithms: short
|
||||
// division for small divisors, the two possibility algorithm for large divisors, and an
|
||||
// undersubtracting long division algorithm for intermediate cases.
|
||||
|
|
@ -34,7 +28,9 @@ macro_rules! impl_trifecta {
|
|||
(tmp as $uX, (tmp >> ($n_h * 2)) as $uX)
|
||||
}
|
||||
fn carrying_mul_add(lhs: $uX, mul: $uX, add: $uX) -> ($uX, $uX) {
|
||||
let tmp = (lhs as $uD).wrapping_mul(mul as $uD).wrapping_add(add as $uD);
|
||||
let tmp = (lhs as $uD)
|
||||
.wrapping_mul(mul as $uD)
|
||||
.wrapping_add(add as $uD);
|
||||
(tmp as $uX, (tmp >> ($n_h * 2)) as $uX)
|
||||
}
|
||||
|
||||
|
|
@ -62,9 +58,9 @@ macro_rules! impl_trifecta {
|
|||
// The quotient cannot be more than 1. The highest set bit of `duo` needs to be at
|
||||
// least one place higher than `div` for the quotient to be more than 1.
|
||||
if duo >= div {
|
||||
return (1, duo - div)
|
||||
return (1, duo - div);
|
||||
} else {
|
||||
return (0, duo)
|
||||
return (0, duo);
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -76,10 +72,7 @@ macro_rules! impl_trifecta {
|
|||
// `duo < 2^n` so it will fit in a $uX. `div` will also fit in a $uX (because of the
|
||||
// `div_lz <= duo_lz` branch) so no numerical error.
|
||||
let (quo, rem) = $half_division(duo as $uX, div as $uX);
|
||||
return (
|
||||
quo as $uD,
|
||||
rem as $uD
|
||||
)
|
||||
return (quo as $uD, rem as $uD);
|
||||
}
|
||||
|
||||
// `{2^n, 2^div_sb} <= duo < 2^n_d`
|
||||
|
|
@ -99,22 +92,16 @@ macro_rules! impl_trifecta {
|
|||
let div_0 = div as $uH as $uX;
|
||||
let (quo_hi, rem_3) = $half_division(duo_hi, div_0);
|
||||
|
||||
let duo_mid =
|
||||
((duo >> $n_h) as $uH as $uX)
|
||||
| (rem_3 << $n_h);
|
||||
let duo_mid = ((duo >> $n_h) as $uH as $uX) | (rem_3 << $n_h);
|
||||
let (quo_1, rem_2) = $half_division(duo_mid, div_0);
|
||||
|
||||
let duo_lo =
|
||||
(duo as $uH as $uX)
|
||||
| (rem_2 << $n_h);
|
||||
let duo_lo = (duo as $uH as $uX) | (rem_2 << $n_h);
|
||||
let (quo_0, rem_1) = $half_division(duo_lo, div_0);
|
||||
|
||||
return (
|
||||
(quo_0 as $uD)
|
||||
| ((quo_1 as $uD) << $n_h)
|
||||
| ((quo_hi as $uD) << n),
|
||||
rem_1 as $uD
|
||||
)
|
||||
(quo_0 as $uD) | ((quo_1 as $uD) << $n_h) | ((quo_hi as $uD) << n),
|
||||
rem_1 as $uD,
|
||||
);
|
||||
}
|
||||
|
||||
// relative leading significant bits, cannot overflow because of above branches
|
||||
|
|
@ -237,13 +224,10 @@ macro_rules! impl_trifecta {
|
|||
(quo - 1) as $uD,
|
||||
// Both the addition and subtraction can overflow, but when combined end up
|
||||
// as a correct positive number.
|
||||
duo.wrapping_add(div).wrapping_sub(tmp)
|
||||
)
|
||||
duo.wrapping_add(div).wrapping_sub(tmp),
|
||||
);
|
||||
} else {
|
||||
return (
|
||||
quo as $uD,
|
||||
duo - tmp
|
||||
)
|
||||
return (quo as $uD, duo - tmp);
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -372,13 +356,10 @@ macro_rules! impl_trifecta {
|
|||
if duo < tmp {
|
||||
return (
|
||||
quo + ((quo_part - 1) as $uD),
|
||||
duo.wrapping_add(div).wrapping_sub(tmp)
|
||||
)
|
||||
duo.wrapping_add(div).wrapping_sub(tmp),
|
||||
);
|
||||
} else {
|
||||
return (
|
||||
quo + (quo_part as $uD),
|
||||
duo - tmp
|
||||
)
|
||||
return (quo + (quo_part as $uD), duo - tmp);
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -387,15 +368,9 @@ macro_rules! impl_trifecta {
|
|||
if div_lz <= duo_lz {
|
||||
// quotient can have 0 or 1 added to it
|
||||
if div <= duo {
|
||||
return (
|
||||
quo + 1,
|
||||
duo - div
|
||||
)
|
||||
return (quo + 1, duo - div);
|
||||
} else {
|
||||
return (
|
||||
quo,
|
||||
duo
|
||||
)
|
||||
return (quo, duo);
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -404,38 +379,9 @@ macro_rules! impl_trifecta {
|
|||
if n <= duo_lz {
|
||||
// simple division and addition
|
||||
let tmp = $half_division(duo as $uX, div as $uX);
|
||||
return (
|
||||
quo + (tmp.0 as $uD),
|
||||
tmp.1 as $uD
|
||||
)
|
||||
return (quo + (tmp.0 as $uD), tmp.1 as $uD);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Computes the quotient and remainder of `duo` divided by `div` and returns them as a
|
||||
/// tuple.
|
||||
$(
|
||||
#[$signed_attr]
|
||||
)*
|
||||
pub fn $signed_name(duo: $iD, div: $iD) -> ($iD, $iD) {
|
||||
match (duo < 0, div < 0) {
|
||||
(false, false) => {
|
||||
let t = $unsigned_name(duo as $uD, div as $uD);
|
||||
(t.0 as $iD, t.1 as $iD)
|
||||
},
|
||||
(true, false) => {
|
||||
let t = $unsigned_name(duo.wrapping_neg() as $uD, div as $uD);
|
||||
((t.0 as $iD).wrapping_neg(), (t.1 as $iD).wrapping_neg())
|
||||
},
|
||||
(false, true) => {
|
||||
let t = $unsigned_name(duo as $uD, div.wrapping_neg() as $uD);
|
||||
((t.0 as $iD).wrapping_neg(), t.1 as $iD)
|
||||
},
|
||||
(true, true) => {
|
||||
let t = $unsigned_name(duo.wrapping_neg() as $uD, div.wrapping_neg() as $uD);
|
||||
(t.0 as $iD, (t.1 as $iD).wrapping_neg())
|
||||
},
|
||||
}
|
||||
}
|
||||
}
|
||||
};
|
||||
}
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue