From 2f0685a9a2a72248b10cd70ca9d013c0ab9bf286 Mon Sep 17 00:00:00 2001
From: Trevor Gross <tmgross@umich.edu>
Date: Sat, 8 Feb 2025 04:09:59 +0000
Subject: [PATCH] Implement `u256` with two `u128`s rather than `u64`

This produces better assembly, e.g. on aarch64:

            .globl  libm::u128_wmul
            .p2align        2
    libm::u128_wmul:
    Lfunc_begin124:
            .cfi_startproc
            mul x9, x2, x0
            umulh x10, x2, x0
            umulh x11, x3, x0
            mul x12, x3, x0
            umulh x13, x2, x1
            mul x14, x2, x1
            umulh x15, x3, x1
            mul x16, x3, x1
            adds x10, x10, x14
            cinc x13, x13, hs
            adds x13, x13, x16
            cinc x14, x15, hs
            adds x10, x10, x12
            cinc x11, x11, hs
            adds x11, x13, x11
            stp x9, x10, [x8]
            cinc x9, x14, hs
            stp x11, x9, [x8, #16]
            ret

The original was ~70 instructions so the improvement is significant.
With these changes, the result is reasonably close to what LLVM
generates using `u256` operands [1].

[1]: https://llvm.godbolt.org/z/re1aGdaqY
---
 .../libm/crates/libm-test/benches/icount.rs   |   6 +-
 .../libm/crates/libm-test/src/gen/random.rs   |   2 +-
 .../libm/crates/libm-test/src/lib.rs          |   5 +-
 .../libm/crates/libm-test/src/run_cfg.rs      |  30 ++-
 .../libm/crates/libm-test/tests/u256.rs       | 147 ++++++++++++
 .../libm/src/math/support/big.rs              | 211 ++++++------------
 .../libm/src/math/support/big/tests.rs        |  79 ++++---
 7 files changed, 295 insertions(+), 185 deletions(-)
 create mode 100644 library/compiler-builtins/libm/crates/libm-test/tests/u256.rs

diff --git a/library/compiler-builtins/libm/crates/libm-test/benches/icount.rs b/library/compiler-builtins/libm/crates/libm-test/benches/icount.rs
index 9fac52e0b852..be85dd5676c2 100644
--- a/library/compiler-builtins/libm/crates/libm-test/benches/icount.rs
+++ b/library/compiler-builtins/libm/crates/libm-test/benches/icount.rs
@@ -77,7 +77,6 @@ fn setup_u128_mul() -> Vec<(u128, u128)> {
     v
 }
 
-/*
 fn setup_u256_add() -> Vec<(u256, u256)> {
     let mut v = Vec::new();
     for (x, y) in setup_u128_mul() {
@@ -88,7 +87,6 @@ fn setup_u256_add() -> Vec<(u256, u256)> {
     v.push((u256::MAX, u256::MAX));
     v
 }
-*/
 
 fn setup_u256_shift() -> Vec<(u256, u32)> {
     let mut v = Vec::new();
@@ -116,7 +114,6 @@ library_benchmark_group!(
     benchmarks = icount_bench_u128_widen_mul
 );
 
-/* Not yet implemented
 #[library_benchmark]
 #[bench::linspace(setup_u256_add())]
 fn icount_bench_u256_add(cases: Vec<(u256, u256)>) {
@@ -129,7 +126,6 @@ library_benchmark_group!(
     name = icount_bench_u256_add_group;
     benchmarks = icount_bench_u256_add
 );
-*/
 
 #[library_benchmark]
 #[bench::linspace(setup_u256_shift())]
@@ -148,7 +144,7 @@ main!(
     library_benchmark_groups =
     // u256-related benchmarks
     icount_bench_u128_widen_mul_group,
-    // icount_bench_u256_add_group,
+    icount_bench_u256_add_group,
     icount_bench_u256_shr_group,
     // verify-apilist-start
     // verify-sorted-start
diff --git a/library/compiler-builtins/libm/crates/libm-test/src/gen/random.rs b/library/compiler-builtins/libm/crates/libm-test/src/gen/random.rs
index 5b127f38d09f..c2cd172d1ee4 100644
--- a/library/compiler-builtins/libm/crates/libm-test/src/gen/random.rs
+++ b/library/compiler-builtins/libm/crates/libm-test/src/gen/random.rs
@@ -14,7 +14,7 @@ use crate::run_cfg::{int_range, iteration_count};
 
 pub(crate) const SEED_ENV: &str = "LIBM_SEED";
 
-pub(crate) static SEED: LazyLock<[u8; 32]> = LazyLock::new(|| {
+pub static SEED: LazyLock<[u8; 32]> = LazyLock::new(|| {
     let s = env::var(SEED_ENV).unwrap_or_else(|_| {
         let mut rng = rand::thread_rng();
         (0..32).map(|_| rng.sample(Alphanumeric) as char).collect()
diff --git a/library/compiler-builtins/libm/crates/libm-test/src/lib.rs b/library/compiler-builtins/libm/crates/libm-test/src/lib.rs
index d2fef2325059..824f09a33873 100644
--- a/library/compiler-builtins/libm/crates/libm-test/src/lib.rs
+++ b/library/compiler-builtins/libm/crates/libm-test/src/lib.rs
@@ -29,7 +29,10 @@ pub use op::{
 };
 pub use precision::{MaybeOverride, SpecialCase, default_ulp};
 use run_cfg::extensive_max_iterations;
-pub use run_cfg::{CheckBasis, CheckCtx, EXTENSIVE_ENV, GeneratorKind, skip_extensive_test};
+pub use run_cfg::{
+    CheckBasis, CheckCtx, EXTENSIVE_ENV, GeneratorKind, bigint_fuzz_iteration_count,
+    skip_extensive_test,
+};
 pub use test_traits::{CheckOutput, Hex, TupleCall};
 
 /// Result type for tests is usually from `anyhow`. Most times there is no success value to
diff --git a/library/compiler-builtins/libm/crates/libm-test/src/run_cfg.rs b/library/compiler-builtins/libm/crates/libm-test/src/run_cfg.rs
index 4dd43bdf3868..6b268997666a 100644
--- a/library/compiler-builtins/libm/crates/libm-test/src/run_cfg.rs
+++ b/library/compiler-builtins/libm/crates/libm-test/src/run_cfg.rs
@@ -158,14 +158,6 @@ impl TestEnv {
         let op = id.math_op();
 
         let will_run_mp = cfg!(feature = "build-mpfr");
-
-        // Tests are pretty slow on non-64-bit targets, x86 MacOS, and targets that run in QEMU. Start
-        // with a reduced number on these platforms.
-        let slow_on_ci = crate::emulated()
-            || usize::BITS < 64
-            || cfg!(all(target_arch = "x86_64", target_vendor = "apple"));
-        let slow_platform = slow_on_ci && crate::ci();
-
         let large_float_ty = match op.float_ty {
             FloatTy::F16 | FloatTy::F32 => false,
             FloatTy::F64 | FloatTy::F128 => true,
@@ -176,7 +168,7 @@ impl TestEnv {
         let input_count = op.rust_sig.args.len();
 
         Self {
-            slow_platform,
+            slow_platform: slow_platform(),
             large_float_ty,
             should_run_extensive: will_run_extensive,
             mp_tests_enabled: will_run_mp,
@@ -185,6 +177,17 @@ impl TestEnv {
     }
 }
 
+/// Tests are pretty slow on non-64-bit targets, x86 MacOS, and targets that run in QEMU. Start
+/// with a reduced number on these platforms.
+fn slow_platform() -> bool {
+    let slow_on_ci = crate::emulated()
+        || usize::BITS < 64
+        || cfg!(all(target_arch = "x86_64", target_vendor = "apple"));
+
+    // If not running in CI, there is no need to reduce iteration count.
+    slow_on_ci && crate::ci()
+}
+
 /// The number of iterations to run for a given test.
 pub fn iteration_count(ctx: &CheckCtx, argnum: usize) -> u64 {
     let t_env = TestEnv::from_env(ctx);
@@ -351,3 +354,12 @@ pub fn skip_extensive_test(ctx: &CheckCtx) -> bool {
     let t_env = TestEnv::from_env(ctx);
     !t_env.should_run_extensive
 }
+
+/// The number of iterations to run for `u256` fuzz tests.
+pub fn bigint_fuzz_iteration_count() -> u64 {
+    if !cfg!(optimizations_enabled) {
+        return 1000;
+    }
+
+    if slow_platform() { 100_000 } else { 5_000_000 }
+}
diff --git a/library/compiler-builtins/libm/crates/libm-test/tests/u256.rs b/library/compiler-builtins/libm/crates/libm-test/tests/u256.rs
new file mode 100644
index 000000000000..4174820c05b4
--- /dev/null
+++ b/library/compiler-builtins/libm/crates/libm-test/tests/u256.rs
@@ -0,0 +1,147 @@
+//! Test the u256 implementation. The ops already get exercised reasonably well through the `f128`
+//! routines, so this only does a few million fuzz iterations against GMP.
+
+#![cfg(feature = "build-mpfr")]
+
+use std::sync::LazyLock;
+
+use libm::support::{HInt, u256};
+type BigInt = rug::Integer;
+
+use libm_test::bigint_fuzz_iteration_count;
+use libm_test::gen::random::SEED;
+use rand::{Rng, SeedableRng};
+use rand_chacha::ChaCha8Rng;
+use rug::Assign;
+use rug::integer::Order;
+use rug::ops::NotAssign;
+
+static BIGINT_U256_MAX: LazyLock<BigInt> =
+    LazyLock::new(|| BigInt::from_digits(&[u128::MAX, u128::MAX], Order::Lsf));
+
+/// Print a `u256` as hex. Copied from `hexu` in the `big` unit tests (`support/big/tests.rs`).
+fn hexu(v: u256) -> String {
+    format!("0x{:032x}{:032x}", v.hi, v.lo)
+}
+
+fn random_u256(rng: &mut ChaCha8Rng) -> u256 {
+    let lo: u128 = rng.gen();
+    let hi: u128 = rng.gen();
+    u256 { lo, hi }
+}
+
+fn assign_bigint(bx: &mut BigInt, x: u256) {
+    bx.assign_digits(&[x.lo, x.hi], Order::Lsf);
+}
+
+fn from_bigint(bx: &mut BigInt) -> u256 {
+    // Truncate so the result fits into `[u128; 2]`. This gives all ops wrapping semantics.
+    *bx &= &*BIGINT_U256_MAX;
+    let mut bres = [0u128, 0];
+    bx.write_digits(&mut bres, Order::Lsf);
+    bx.assign(0);
+    u256 { lo: bres[0], hi: bres[1] }
+}
+
+fn check_one(
+    x: impl FnOnce() -> String,
+    y: impl FnOnce() -> Option<String>,
+    actual: u256,
+    expected: &mut BigInt,
+) {
+    let expected = from_bigint(expected);
+    if actual != expected {
+        let xmsg = x();
+        let ymsg = y().map(|y| format!("y:        {y}\n")).unwrap_or_default();
+        panic!(
+            "Results do not match\n\
+            input:    {xmsg}\n\
+            {ymsg}\
+            actual:   {}\n\
+            expected: {}\
+            ",
+            hexu(actual),
+            hexu(expected),
+        )
+    }
+}
+
+#[test]
+fn mp_u256_bitor() {
+    let mut rng = ChaCha8Rng::from_seed(*SEED);
+    let mut bx = BigInt::new();
+    let mut by = BigInt::new();
+
+    for _ in 0..bigint_fuzz_iteration_count() {
+        let x = random_u256(&mut rng);
+        let y = random_u256(&mut rng);
+        assign_bigint(&mut bx, x);
+        assign_bigint(&mut by, y);
+        let actual = x | y;
+        bx |= &by;
+        check_one(|| hexu(x), || Some(hexu(y)), actual, &mut bx);
+    }
+}
+
+#[test]
+fn mp_u256_not() {
+    let mut rng = ChaCha8Rng::from_seed(*SEED);
+    let mut bx = BigInt::new();
+
+    for _ in 0..bigint_fuzz_iteration_count() {
+        let x = random_u256(&mut rng);
+        assign_bigint(&mut bx, x);
+        let actual = !x;
+        bx.not_assign();
+        check_one(|| hexu(x), || None, actual, &mut bx);
+    }
+}
+
+#[test]
+fn mp_u256_add() {
+    let mut rng = ChaCha8Rng::from_seed(*SEED);
+    let mut bx = BigInt::new();
+    let mut by = BigInt::new();
+
+    for _ in 0..bigint_fuzz_iteration_count() {
+        let x = random_u256(&mut rng);
+        let y = random_u256(&mut rng);
+        assign_bigint(&mut bx, x);
+        assign_bigint(&mut by, y);
+        let actual = x + y;
+        bx += &by;
+        check_one(|| hexu(x), || Some(hexu(y)), actual, &mut bx);
+    }
+}
+
+#[test]
+fn mp_u256_shr() {
+    let mut rng = ChaCha8Rng::from_seed(*SEED);
+    let mut bx = BigInt::new();
+
+    for _ in 0..bigint_fuzz_iteration_count() {
+        let x = random_u256(&mut rng);
+        let shift: u32 = rng.gen_range(0..255);
+        assign_bigint(&mut bx, x);
+        let actual = x >> shift;
+        bx >>= shift;
+        check_one(|| hexu(x), || Some(shift.to_string()), actual, &mut bx);
+    }
+}
+
+#[test]
+fn mp_u256_widen_mul() {
+    let mut rng = ChaCha8Rng::from_seed(*SEED);
+    let mut bx = BigInt::new();
+    let mut by = BigInt::new();
+
+    for _ in 0..bigint_fuzz_iteration_count() {
+        let x: u128 = rng.gen();
+        let y: u128 = rng.gen();
+        bx.assign(x);
+        by.assign(y);
+        let actual = x.widen_mul(y);
+        bx *= &by;
+        check_one(|| format!("{x:#034x}"), || Some(format!("{y:#034x}")), actual, &mut bx);
+    }
+}
diff --git a/library/compiler-builtins/libm/src/math/support/big.rs b/library/compiler-builtins/libm/src/math/support/big.rs
index 017e9455e406..eae08238e099 100644
--- a/library/compiler-builtins/libm/src/math/support/big.rs
+++ b/library/compiler-builtins/libm/src/math/support/big.rs
@@ -7,40 +7,39 @@ use core::ops;
 
 use super::{DInt, HInt, Int, MinInt};
 
-const WORD_LO_MASK: u64 = 0x00000000ffffffff;
-const WORD_HI_MASK: u64 = 0xffffffff00000000;
-const WORD_FULL_MASK: u64 = 0xffffffffffffffff;
 const U128_LO_MASK: u128 = u64::MAX as u128;
 
-/// A 256-bit unsigned integer represented as 4 64-bit limbs.
-///
-/// Each limb is a native-endian number, but the array is little-limb-endian.
+/// A 256-bit unsigned integer represented as two 128-bit native-endian limbs.
 #[allow(non_camel_case_types)]
 #[derive(Clone, Copy, Debug, PartialEq, PartialOrd)]
-pub struct u256(pub [u64; 4]);
+pub struct u256 {
+    pub lo: u128,
+    pub hi: u128,
+}
 
 impl u256 {
-    #[allow(unused)]
-    pub const MAX: Self = Self([u64::MAX, u64::MAX, u64::MAX, u64::MAX]);
+    #[cfg(any(test, feature = "unstable-public-internals"))]
+    pub const MAX: Self = Self { lo: u128::MAX, hi: u128::MAX };
 
     /// Reinterpret as a signed integer
     pub fn signed(self) -> i256 {
-        i256(self.0)
+        i256 { lo: self.lo, hi: self.hi }
     }
 }
 
-/// A 256-bit signed integer represented as 4 64-bit limbs.
-///
-/// Each limb is a native-endian number, but the array is little-limb-endian.
+/// A 256-bit signed integer represented as two 128-bit native-endian limbs.
 #[allow(non_camel_case_types)]
 #[derive(Clone, Copy, Debug, PartialEq, PartialOrd)]
-pub struct i256(pub [u64; 4]);
+pub struct i256 {
+    pub lo: u128,
+    pub hi: u128,
+}
 
 impl i256 {
     /// Reinterpret as an unsigned integer
-    #[cfg(test)]
+    #[cfg(any(test, feature = "unstable-public-internals"))]
     pub fn unsigned(self) -> u256 {
-        u256(self.0)
+        u256 { lo: self.lo, hi: self.hi }
     }
 }
 
@@ -51,10 +50,10 @@ impl MinInt for u256 {
 
     const SIGNED: bool = false;
     const BITS: u32 = 256;
-    const ZERO: Self = Self([0u64; 4]);
-    const ONE: Self = Self([1, 0, 0, 0]);
-    const MIN: Self = Self([0u64; 4]);
-    const MAX: Self = Self([u64::MAX; 4]);
+    const ZERO: Self = Self { lo: 0, hi: 0 };
+    const ONE: Self = Self { lo: 1, hi: 0 };
+    const MIN: Self = Self { lo: 0, hi: 0 };
+    const MAX: Self = Self { lo: u128::MAX, hi: u128::MAX };
 }
 
 impl MinInt for i256 {
@@ -64,10 +63,10 @@ impl MinInt for i256 {
 
     const SIGNED: bool = false;
     const BITS: u32 = 256;
-    const ZERO: Self = Self([0u64; 4]);
-    const ONE: Self = Self([1, 0, 0, 0]);
-    const MIN: Self = Self([0, 0, 0, 1 << 63]);
-    const MAX: Self = Self([u64::MAX, u64::MAX, u64::MAX, u64::MAX << 1]);
+    const ZERO: Self = Self { lo: 0, hi: 0 };
+    const ONE: Self = Self { lo: 1, hi: 0 };
+    const MIN: Self = Self { lo: 0, hi: 1 << 127 };
+    const MAX: Self = Self { lo: u128::MAX, hi: u128::MAX << 1 };
 }
 
 macro_rules! impl_common {
@@ -76,10 +75,8 @@ macro_rules! impl_common {
             type Output = Self;
 
             fn bitor(mut self, rhs: Self) -> Self::Output {
-                self.0[0] |= rhs.0[0];
-                self.0[1] |= rhs.0[1];
-                self.0[2] |= rhs.0[2];
-                self.0[3] |= rhs.0[3];
+                self.lo |= rhs.lo;
+                self.hi |= rhs.hi;
                 self
             }
         }
@@ -87,8 +84,10 @@ macro_rules! impl_common {
         impl ops::Not for $ty {
             type Output = Self;
 
-            fn not(self) -> Self::Output {
-                Self([!self.0[0], !self.0[1], !self.0[2], !self.0[3]])
+            fn not(mut self) -> Self::Output {
+                self.lo = !self.lo;
+                self.hi = !self.hi;
+                self
             }
         }
 
@@ -105,10 +104,21 @@ macro_rules! impl_common {
 impl_common!(i256);
 impl_common!(u256);
 
+impl ops::Add<Self> for u256 {
+    type Output = Self;
+
+    fn add(self, rhs: Self) -> Self::Output {
+        let (lo, carry) = self.lo.overflowing_add(rhs.lo);
+        let hi = self.hi.wrapping_add(carry as u128).wrapping_add(rhs.hi);
+
+        Self { lo, hi }
+    }
+}
+
 impl ops::Shr<u32> for u256 {
     type Output = Self;
 
-    fn shr(self, rhs: u32) -> Self::Output {
+    fn shr(mut self, rhs: u32) -> Self::Output {
         debug_assert!(rhs < Self::BITS, "attempted to shift right with overflow");
         if rhs >= Self::BITS {
             return Self::ZERO;
@@ -118,57 +128,28 @@ impl ops::Shr<u32> for u256 {
             return self;
         }
 
-        let mut ret = self;
-        let byte_shift = rhs / 64;
-        let bit_shift = rhs % 64;
-
-        for idx in 0..4 {
-            let base_idx = idx + byte_shift as usize;
-
-            // FIXME(msrv): could be let...else.
-            let base = match ret.0.get(base_idx) {
-                Some(v) => v,
-                None => {
-                    ret.0[idx] = 0;
-                    continue;
-                }
-            };
-
-            let mut new_val = base >> bit_shift;
-
-            if let Some(new) = ret.0.get(base_idx + 1) {
-                new_val |= new.overflowing_shl(64 - bit_shift).0;
-            }
-
-            ret.0[idx] = new_val;
+        if rhs < 128 {
+            self.lo >>= rhs;
+            self.lo |= self.hi << (128 - rhs);
+        } else {
+            self.lo = self.hi >> (rhs - 128);
         }
 
-        ret
-    }
-}
+        if rhs < 128 {
+            self.hi >>= rhs;
+        } else {
+            self.hi = 0;
+        }
 
-macro_rules! word {
-    (1, $val:expr) => {
-        (($val >> (32 * 3)) & Self::from(WORD_LO_MASK)) as u64
-    };
-    (2, $val:expr) => {
-        (($val >> (32 * 2)) & Self::from(WORD_LO_MASK)) as u64
-    };
-    (3, $val:expr) => {
-        (($val >> (32 * 1)) & Self::from(WORD_LO_MASK)) as u64
-    };
-    (4, $val:expr) => {
-        (($val >> (32 * 0)) & Self::from(WORD_LO_MASK)) as u64
-    };
+        self
+    }
 }
 
 impl HInt for u128 {
     type D = u256;
 
     fn widen(self) -> Self::D {
-        let w0 = self & u128::from(u64::MAX);
-        let w1 = (self >> u64::BITS) & u128::from(u64::MAX);
-        u256([w0 as u64, w1 as u64, 0, 0])
+        u256 { lo: self, hi: 0 }
     }
 
     fn zero_widen(self) -> Self::D {
@@ -176,57 +157,24 @@ impl HInt for u128 {
     }
 
     fn zero_widen_mul(self, rhs: Self) -> Self::D {
-        let product11: u64 = word!(1, self) * word!(1, rhs);
-        let product12: u64 = word!(1, self) * word!(2, rhs);
-        let product13: u64 = word!(1, self) * word!(3, rhs);
-        let product14: u64 = word!(1, self) * word!(4, rhs);
-        let product21: u64 = word!(2, self) * word!(1, rhs);
-        let product22: u64 = word!(2, self) * word!(2, rhs);
-        let product23: u64 = word!(2, self) * word!(3, rhs);
-        let product24: u64 = word!(2, self) * word!(4, rhs);
-        let product31: u64 = word!(3, self) * word!(1, rhs);
-        let product32: u64 = word!(3, self) * word!(2, rhs);
-        let product33: u64 = word!(3, self) * word!(3, rhs);
-        let product34: u64 = word!(3, self) * word!(4, rhs);
-        let product41: u64 = word!(4, self) * word!(1, rhs);
-        let product42: u64 = word!(4, self) * word!(2, rhs);
-        let product43: u64 = word!(4, self) * word!(3, rhs);
-        let product44: u64 = word!(4, self) * word!(4, rhs);
+        let l0 = self & U128_LO_MASK;
+        let l1 = rhs & U128_LO_MASK;
+        let h0 = self >> 64;
+        let h1 = rhs >> 64;
 
-        let sum0: u128 = u128::from(product44);
-        let sum1: u128 = u128::from(product34) + u128::from(product43);
-        let sum2: u128 = u128::from(product24) + u128::from(product33) + u128::from(product42);
-        let sum3: u128 = u128::from(product14)
-            + u128::from(product23)
-            + u128::from(product32)
-            + u128::from(product41);
-        let sum4: u128 = u128::from(product13) + u128::from(product22) + u128::from(product31);
-        let sum5: u128 = u128::from(product12) + u128::from(product21);
-        let sum6: u128 = u128::from(product11);
+        let p_ll: u128 = l0.overflowing_mul(l1).0;
+        let p_lh: u128 = l0.overflowing_mul(h1).0;
+        let p_hl: u128 = h0.overflowing_mul(l1).0;
+        let p_hh: u128 = h0.overflowing_mul(h1).0;
 
-        let r0: u128 =
-            (sum0 & u128::from(WORD_FULL_MASK)) + ((sum1 & u128::from(WORD_LO_MASK)) << 32);
-        let r1: u128 = (sum0 >> 64)
-            + ((sum1 >> 32) & u128::from(WORD_FULL_MASK))
-            + (sum2 & u128::from(WORD_FULL_MASK))
-            + ((sum3 << 32) & u128::from(WORD_HI_MASK));
+        let s0 = p_hl + (p_ll >> 64);
+        let s1 = (p_ll & U128_LO_MASK) + (s0 << 64);
+        let s2 = p_lh + (s1 >> 64);
 
-        let (lo, carry) = r0.overflowing_add(r1 << 64);
-        let hi = (r1 >> 64)
-            + (sum1 >> 96)
-            + (sum2 >> 64)
-            + (sum3 >> 32)
-            + sum4
-            + (sum5 << 32)
-            + (sum6 << 64)
-            + u128::from(carry);
+        let lo = (p_ll & U128_LO_MASK) + (s2 << 64);
+        let hi = p_hh + (s0 >> 64) + (s2 >> 64);
 
-        u256([
-            (lo & U128_LO_MASK) as u64,
-            ((lo >> 64) & U128_LO_MASK) as u64,
-            (hi & U128_LO_MASK) as u64,
-            ((hi >> 64) & U128_LO_MASK) as u64,
-        ])
+        u256 { lo, hi }
     }
 
     fn widen_mul(self, rhs: Self) -> Self::D {
@@ -244,8 +192,7 @@ impl HInt for i128 {
     fn widen(self) -> Self::D {
         let mut ret = self.unsigned().zero_widen().signed();
         if self.is_negative() {
-            ret.0[2] = u64::MAX;
-            ret.0[3] = u64::MAX;
+            ret.hi = u128::MAX;
         }
         ret
     }
@@ -271,17 +218,11 @@ impl DInt for u256 {
     type H = u128;
 
     fn lo(self) -> Self::H {
-        let mut tmp = [0u8; 16];
-        tmp[..8].copy_from_slice(&self.0[0].to_le_bytes());
-        tmp[8..].copy_from_slice(&self.0[1].to_le_bytes());
-        u128::from_le_bytes(tmp)
+        self.lo
     }
 
     fn hi(self) -> Self::H {
-        let mut tmp = [0u8; 16];
-        tmp[..8].copy_from_slice(&self.0[2].to_le_bytes());
-        tmp[8..].copy_from_slice(&self.0[3].to_le_bytes());
-        u128::from_le_bytes(tmp)
+        self.hi
     }
 }
 
@@ -289,16 +230,10 @@ impl DInt for i256 {
     type H = i128;
 
     fn lo(self) -> Self::H {
-        let mut tmp = [0u8; 16];
-        tmp[..8].copy_from_slice(&self.0[0].to_le_bytes());
-        tmp[8..].copy_from_slice(&self.0[1].to_le_bytes());
-        i128::from_le_bytes(tmp)
+        self.lo as i128
     }
 
     fn hi(self) -> Self::H {
-        let mut tmp = [0u8; 16];
-        tmp[..8].copy_from_slice(&self.0[2].to_le_bytes());
-        tmp[8..].copy_from_slice(&self.0[3].to_le_bytes());
-        i128::from_le_bytes(tmp)
+        self.hi as i128
     }
 }
diff --git a/library/compiler-builtins/libm/src/math/support/big/tests.rs b/library/compiler-builtins/libm/src/math/support/big/tests.rs
index 815a62dfee84..6d06c700a5ee 100644
--- a/library/compiler-builtins/libm/src/math/support/big/tests.rs
+++ b/library/compiler-builtins/libm/src/math/support/big/tests.rs
@@ -9,33 +9,30 @@ const LOHI_SPLIT: u128 = 0xaaaaaaaaaaaaaaaaffffffffffffffff;
 
 /// Print a `u256` as hex since we can't add format implementations
 fn hexu(v: u256) -> String {
-    format!("0x{:016x}{:016x}{:016x}{:016x}", v.0[3], v.0[2], v.0[1], v.0[0])
+    format!("0x{:032x}{:032x}", v.hi, v.lo)
 }
 
 #[test]
 fn widen_u128() {
-    assert_eq!(u128::MAX.widen(), u256([u64::MAX, u64::MAX, 0, 0]));
-    assert_eq!(LOHI_SPLIT.widen(), u256([u64::MAX, 0xaaaaaaaaaaaaaaaa, 0, 0]));
+    assert_eq!(u128::MAX.widen(), u256 { lo: u128::MAX, hi: 0 });
+    assert_eq!(LOHI_SPLIT.widen(), u256 { lo: LOHI_SPLIT, hi: 0 });
 }
 
 #[test]
 fn widen_i128() {
     assert_eq!((-1i128).widen(), u256::MAX.signed());
-    assert_eq!(
-        (LOHI_SPLIT as i128).widen(),
-        i256([u64::MAX, 0xaaaaaaaaaaaaaaaa, u64::MAX, u64::MAX])
-    );
+    assert_eq!((LOHI_SPLIT as i128).widen(), i256 { lo: LOHI_SPLIT, hi: u128::MAX });
     assert_eq!((-1i128).zero_widen().unsigned(), (u128::MAX).widen());
 }
 
 #[test]
 fn widen_mul_u128() {
     let tests = [
-        (u128::MAX / 2, 2_u128, u256([u64::MAX - 1, u64::MAX, 0, 0])),
-        (u128::MAX, 2_u128, u256([u64::MAX - 1, u64::MAX, 1, 0])),
-        (u128::MAX, u128::MAX, u256([1, 0, u64::MAX - 1, u64::MAX])),
-        (u128::MIN, u128::MIN, u256::ZERO),
-        (1234, 0, u256::ZERO),
+        (u128::MAX / 2, 2_u128, u256 { lo: u128::MAX - 1, hi: 0 }),
+        (u128::MAX, 2_u128, u256 { lo: u128::MAX - 1, hi: 1 }),
+        (u128::MAX, u128::MAX, u256 { lo: 1, hi: u128::MAX - 1 }),
+        (0, 0, u256::ZERO),
+        (1234u128, 0, u256::ZERO),
         (0, 1234, u256::ZERO),
     ];
 
@@ -50,20 +47,27 @@ fn widen_mul_u128() {
     }
 
     for (i, a, b, exp, res) in &errors {
-        eprintln!("FAILURE ({i}): {a:#034x} * {b:#034x} = {} got {}", hexu(*exp), hexu(*res));
+        eprintln!(
+            "\
+            FAILURE ({i}): {a:#034x} * {b:#034x}\n\
+            expected: {}\n\
+            got:      {}\
+            ",
+            hexu(*exp),
+            hexu(*res)
+        );
     }
     assert!(errors.is_empty());
 }
 
 #[test]
-fn not_u128() {
+fn not_u256() {
     assert_eq!(!u256::ZERO, u256::MAX);
 }
 
 #[test]
-fn shr_u128() {
+fn shr_u256() {
     let only_low = [1, u16::MAX.into(), u32::MAX.into(), u64::MAX.into(), u128::MAX];
-
     let mut errors = Vec::new();
 
     for a in only_low {
@@ -80,20 +84,24 @@ fn shr_u128() {
     }
 
     let check = [
-        (u256::MAX, 1, u256([u64::MAX, u64::MAX, u64::MAX, u64::MAX >> 1])),
-        (u256::MAX, 5, u256([u64::MAX, u64::MAX, u64::MAX, u64::MAX >> 5])),
-        (u256::MAX, 63, u256([u64::MAX, u64::MAX, u64::MAX, 1])),
-        (u256::MAX, 64, u256([u64::MAX, u64::MAX, u64::MAX, 0])),
-        (u256::MAX, 65, u256([u64::MAX, u64::MAX, u64::MAX >> 1, 0])),
-        (u256::MAX, 127, u256([u64::MAX, u64::MAX, 1, 0])),
-        (u256::MAX, 128, u256([u64::MAX, u64::MAX, 0, 0])),
-        (u256::MAX, 129, u256([u64::MAX, u64::MAX >> 1, 0, 0])),
-        (u256::MAX, 191, u256([u64::MAX, 1, 0, 0])),
-        (u256::MAX, 192, u256([u64::MAX, 0, 0, 0])),
-        (u256::MAX, 193, u256([u64::MAX >> 1, 0, 0, 0])),
-        (u256::MAX, 191, u256([u64::MAX, 1, 0, 0])),
-        (u256::MAX, 254, u256([0b11, 0, 0, 0])),
-        (u256::MAX, 255, u256([1, 0, 0, 0])),
+        (u256::MAX, 1, u256 { lo: u128::MAX, hi: u128::MAX >> 1 }),
+        (u256::MAX, 5, u256 { lo: u128::MAX, hi: u128::MAX >> 5 }),
+        (u256::MAX, 63, u256 { lo: u128::MAX, hi: u64::MAX as u128 | (1 << 64) }),
+        (u256::MAX, 64, u256 { lo: u128::MAX, hi: u64::MAX as u128 }),
+        (u256::MAX, 65, u256 { lo: u128::MAX, hi: (u64::MAX >> 1) as u128 }),
+        (u256::MAX, 127, u256 { lo: u128::MAX, hi: 1 }),
+        (u256::MAX, 128, u256 { lo: u128::MAX, hi: 0 }),
+        (u256::MAX, 129, u256 { lo: u128::MAX >> 1, hi: 0 }),
+        (u256::MAX, 191, u256 { lo: u64::MAX as u128 | 1 << 64, hi: 0 }),
+        (u256::MAX, 192, u256 { lo: u64::MAX as u128, hi: 0 }),
+        (u256::MAX, 193, u256 { lo: u64::MAX as u128 >> 1, hi: 0 }),
+        (u256::MAX, 254, u256 { lo: 0b11, hi: 0 }),
+        (u256::MAX, 255, u256 { lo: 1, hi: 0 }),
+        (
+            u256 { hi: LOHI_SPLIT, lo: 0 },
+            64,
+            u256 { lo: 0xffffffffffffffff0000000000000000, hi: 0xaaaaaaaaaaaaaaaa },
+        ),
     ];
 
     for (input, shift, expected) in check {
@@ -104,7 +112,16 @@ fn shr_u128() {
     }
 
     for (a, b, res, expected) in &errors {
-        eprintln!("FAILURE: {} >> {b} = {} got {}", hexu(*a), hexu(*expected), hexu(*res),);
+        eprintln!(
+            "\
+            FAILURE:  {} >> {b}\n\
+            expected: {}\n\
+            got:      {}\
+            ",
+            hexu(*a),
+            hexu(*expected),
+            hexu(*res)
+        );
     }
     assert!(errors.is_empty());
 }