Merge compiler-builtins as a Josh subtree

Use the Josh [1] utility to add `compiler-builtins` as a subtree, which
will allow us to stop using crates.io for updates. This is intended to
help resolve some problems when unstable features change and require
code changes in `compiler-builtins`, which sometimes gets trapped in a
bootstrap cycle.

This was done using `josh-filter` built from the r24.10.04 tag:

    git fetch https://github.com/rust-lang/compiler-builtins.git 233434412fe7eced8f1ddbfeddabef1d55e493bd
    josh-filter ":prefix=library/compiler-builtins" FETCH_HEAD
    git merge --allow-unrelated FILTERED_HEAD

The HEAD in the `compiler-builtins` repository is 233434412f ("fix an if
statement that can be collapsed").

[1]: https://github.com/josh-project/josh
This commit is contained in:
Trevor Gross 2025-05-18 15:08:03 +00:00
commit fcb3000340
380 changed files with 52998 additions and 0 deletions

View file

@ -0,0 +1,16 @@
# EditorConfig helps developers define and maintain consistent
# coding styles between different editors and IDEs
# editorconfig.org
root = true
[*]
end_of_line = lf
charset = utf-8
trim_trailing_whitespace = true
insert_final_newline = true
indent_style = space
indent_size = 4
[*.yml]
indent_size = 2

View file

@ -0,0 +1,6 @@
# Use `git config blame.ignorerevsfile .git-blame-ignore-revs` to make
# `git blame` ignore the following commits.
# Reformat with a new `.rustfmt.toml`
# In rust-lang/libm this was 5882cabb83c30bf7c36023f9a55a80583636b0e8
4bb07a6275cc628ef81c65ac971dc6479963322f

View file

@ -0,0 +1,344 @@
name: CI
on:
push: { branches: [master] }
pull_request:
concurrency:
# Make sure that new pushes cancel running jobs
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
cancel-in-progress: true
env:
CARGO_TERM_COLOR: always
RUSTDOCFLAGS: -Dwarnings
RUSTFLAGS: -Dwarnings
RUST_BACKTRACE: full
BENCHMARK_RUSTC: nightly-2025-01-16 # Pin the toolchain for reproducible results
jobs:
# Determine which tests should be run based on changed files.
calculate_vars:
name: Calculate workflow variables
runs-on: ubuntu-24.04
timeout-minutes: 10
env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
PR_NUMBER: ${{ github.event.pull_request.number }}
outputs:
extensive_matrix: ${{ steps.script.outputs.extensive_matrix }}
may_skip_libm_ci: ${{ steps.script.outputs.may_skip_libm_ci }}
steps:
- uses: actions/checkout@v4
with:
fetch-depth: 500
- name: Fetch pull request ref
run: git fetch origin "$GITHUB_REF:$GITHUB_REF"
if: github.event_name == 'pull_request'
- run: python3 ci/ci-util.py generate-matrix >> "$GITHUB_OUTPUT"
id: script
test:
name: Build and test
timeout-minutes: 60
strategy:
fail-fast: false
matrix:
include:
- target: aarch64-apple-darwin
os: macos-15
- target: aarch64-unknown-linux-gnu
os: ubuntu-24.04-arm
- target: aarch64-pc-windows-msvc
os: windows-2025
test_verbatim: 1
build_only: 1
- target: arm-unknown-linux-gnueabi
os: ubuntu-24.04
- target: arm-unknown-linux-gnueabihf
os: ubuntu-24.04
- target: armv7-unknown-linux-gnueabihf
os: ubuntu-24.04
- target: i586-unknown-linux-gnu
os: ubuntu-24.04
- target: i686-unknown-linux-gnu
os: ubuntu-24.04
- target: loongarch64-unknown-linux-gnu
os: ubuntu-24.04
- target: powerpc-unknown-linux-gnu
os: ubuntu-24.04
- target: powerpc64-unknown-linux-gnu
os: ubuntu-24.04
- target: powerpc64le-unknown-linux-gnu
os: ubuntu-24.04
- target: riscv64gc-unknown-linux-gnu
os: ubuntu-24.04
- target: thumbv6m-none-eabi
os: ubuntu-24.04
- target: thumbv7em-none-eabi
os: ubuntu-24.04
- target: thumbv7em-none-eabihf
os: ubuntu-24.04
- target: thumbv7m-none-eabi
os: ubuntu-24.04
- target: wasm32-unknown-unknown
os: ubuntu-24.04
- target: x86_64-unknown-linux-gnu
os: ubuntu-24.04
- target: x86_64-apple-darwin
os: macos-13
- target: i686-pc-windows-msvc
os: windows-2025
test_verbatim: 1
- target: x86_64-pc-windows-msvc
os: windows-2025
test_verbatim: 1
- target: i686-pc-windows-gnu
os: windows-2025
channel: nightly-i686-gnu
- target: x86_64-pc-windows-gnu
os: windows-2025
channel: nightly-x86_64-gnu
runs-on: ${{ matrix.os }}
needs: [calculate_vars]
env:
BUILD_ONLY: ${{ matrix.build_only }}
TEST_VERBATIM: ${{ matrix.test_verbatim }}
MAY_SKIP_LIBM_CI: ${{ needs.calculate_vars.outputs.may_skip_libm_ci }}
steps:
- name: Print runner information
run: uname -a
- uses: actions/checkout@v4
with:
submodules: true
- name: Install Rust (rustup)
shell: bash
run: |
channel="nightly"
# Account for channels that have required components (MinGW)
[ -n "${{ matrix.channel }}" ] && channel="${{ matrix.channel }}"
rustup update "$channel" --no-self-update
rustup default "$channel"
rustup target add "${{ matrix.target }}"
rustup component add llvm-tools-preview
- uses: taiki-e/install-action@nextest
- uses: Swatinem/rust-cache@v2
with:
key: ${{ matrix.target }}
- name: Cache Docker layers
uses: actions/cache@v4
if: matrix.os == 'ubuntu-24.04'
with:
path: /tmp/.buildx-cache
key: ${{ matrix.target }}-buildx-${{ github.sha }}
restore-keys: ${{ matrix.target }}-buildx-
# Configure buildx to use Docker layer caching
- uses: docker/setup-buildx-action@v3
if: matrix.os == 'ubuntu-24.04'
- name: Cache compiler-rt
id: cache-compiler-rt
uses: actions/cache@v4
with:
path: compiler-rt
key: ${{ runner.os }}-compiler-rt-${{ hashFiles('ci/download-compiler-rt.sh') }}
- name: Download compiler-rt reference sources
if: steps.cache-compiler-rt.outputs.cache-hit != 'true'
run: ./ci/download-compiler-rt.sh
shell: bash
- run: echo "RUST_COMPILER_RT_ROOT=$(realpath ./compiler-rt)" >> "$GITHUB_ENV"
shell: bash
- name: Verify API list
if: matrix.os == 'ubuntu-24.04'
run: python3 etc/update-api-list.py --check
# Non-linux tests just use our raw script
- name: Run locally
if: matrix.os != 'ubuntu-24.04'
shell: bash
run: ./ci/run.sh ${{ matrix.target }}
# Otherwise we use our docker containers to run builds
- name: Run in Docker
if: matrix.os == 'ubuntu-24.04'
run: ./ci/run-docker.sh ${{ matrix.target }}
- name: Print test logs if available
if: always()
run: if [ -f "target/test-log.txt" ]; then cat target/test-log.txt; fi
shell: bash
# Workaround to keep Docker cache smaller
# https://github.com/docker/build-push-action/issues/252
# https://github.com/moby/buildkit/issues/1896
- name: Move Docker cache
if: matrix.os == 'ubuntu-24.04'
run: |
rm -rf /tmp/.buildx-cache
mv /tmp/.buildx-cache-new /tmp/.buildx-cache
clippy:
name: Clippy
runs-on: ubuntu-24.04
timeout-minutes: 10
steps:
- uses: actions/checkout@v4
with:
submodules: true
# Unlike rustfmt, stable clippy does not work on code with nightly features.
- name: Install nightly `clippy`
run: |
rustup set profile minimal
rustup default nightly
rustup component add clippy
- uses: Swatinem/rust-cache@v2
- run: cargo clippy --workspace --all-targets
benchmarks:
name: Benchmarks
runs-on: ubuntu-24.04
timeout-minutes: 20
steps:
- uses: actions/checkout@master
with:
submodules: true
- uses: taiki-e/install-action@cargo-binstall
- name: Set up dependencies
run: |
sudo apt-get update
sudo apt-get install -y valgrind gdb libc6-dbg # Needed for iai-callgrind
rustup update "$BENCHMARK_RUSTC" --no-self-update
rustup default "$BENCHMARK_RUSTC"
# Install the version of iai-callgrind-runner that is specified in Cargo.toml
iai_version="$(cargo metadata --format-version=1 --features icount |
jq -r '.packages[] | select(.name == "iai-callgrind").version')"
cargo binstall -y iai-callgrind-runner --version "$iai_version"
sudo apt-get install valgrind
- uses: Swatinem/rust-cache@v2
- name: Run icount benchmarks
env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
PR_NUMBER: ${{ github.event.pull_request.number }}
run: ./ci/bench-icount.sh
- name: Upload the benchmark baseline
uses: actions/upload-artifact@v4
with:
name: ${{ env.BASELINE_NAME }}
path: ${{ env.BASELINE_NAME }}.tar.xz
- name: Run wall time benchmarks
run: |
# Always use the same seed for benchmarks. Ideally we should switch to a
# non-random generator.
export LIBM_SEED=benchesbenchesbenchesbencheswoo!
cargo bench --package libm-test \
--no-default-features \
--features short-benchmarks,build-musl,libm/force-soft-floats
- name: Print test logs if available
if: always()
run: if [ -f "target/test-log.txt" ]; then cat target/test-log.txt; fi
shell: bash
miri:
name: Miri
runs-on: ubuntu-24.04
timeout-minutes: 10
steps:
- uses: actions/checkout@v4
with:
submodules: true
- name: Install Rust (rustup)
run: rustup update nightly --no-self-update && rustup default nightly
shell: bash
- run: rustup component add miri
- run: cargo miri setup
- uses: Swatinem/rust-cache@v2
- run: ./ci/miri.sh
msrv:
name: Check libm MSRV
runs-on: ubuntu-24.04
timeout-minutes: 10
env:
RUSTFLAGS: # No need to check warnings on old MSRV, unset `-Dwarnings`
steps:
- uses: actions/checkout@master
- name: Install Rust
run: |
msrv="$(perl -ne 'print if s/rust-version\s*=\s*"(.*)"/\1/g' libm/Cargo.toml)"
echo "MSRV: $msrv"
rustup update "$msrv" --no-self-update && rustup default "$msrv"
- uses: Swatinem/rust-cache@v2
- run: |
# FIXME(msrv): Remove the workspace Cargo.toml so 1.63 cargo doesn't see
# `edition = "2024"` and get spooked.
rm Cargo.toml
cargo build --manifest-path libm/Cargo.toml
rustfmt:
name: Rustfmt
runs-on: ubuntu-24.04
timeout-minutes: 10
steps:
- uses: actions/checkout@v4
with:
submodules: true
- name: Install stable `rustfmt`
run: rustup set profile minimal && rustup default stable && rustup component add rustfmt
- run: cargo fmt -- --check
extensive:
name: Extensive tests for ${{ matrix.ty }}
needs:
# Wait on `clippy` so we have some confidence that the crate will build
- clippy
- calculate_vars
runs-on: ubuntu-24.04
timeout-minutes: 240 # 4 hours
strategy:
matrix:
# Use the output from `calculate_vars` to create the matrix
# FIXME: it would be better to run all jobs (i.e. all types) but mark those that
# didn't change as skipped, rather than completely excluding the job. However,
# this is not currently possible https://github.com/actions/runner/issues/1985.
include: ${{ fromJSON(needs.calculate_vars.outputs.extensive_matrix).extensive_matrix }}
env:
TO_TEST: ${{ matrix.to_test }}
steps:
- uses: actions/checkout@v4
with:
submodules: true
- name: Install Rust
run: |
rustup update nightly --no-self-update
rustup default nightly
- uses: Swatinem/rust-cache@v2
- name: Run extensive tests
run: ./ci/run-extensive.sh
- name: Print test logs if available
run: if [ -f "target/test-log.txt" ]; then cat target/test-log.txt; fi
shell: bash
success:
needs:
- benchmarks
- clippy
- extensive
- miri
- msrv
- rustfmt
- test
runs-on: ubuntu-24.04
timeout-minutes: 10
# GitHub branch protection is exceedingly silly and treats "jobs skipped because a dependency
# failed" as success. So we have to do some contortions to ensure the job fails if any of its
# dependencies fails.
if: always() # make sure this is never "skipped"
steps:
# Manually check the status of all dependencies. `if: failure()` does not work.
- name: check if any dependency failed
run: jq --exit-status 'all(.result == "success")' <<< '${{ toJson(needs) }}'

View file

@ -0,0 +1,25 @@
name: Release-plz
permissions:
pull-requests: write
contents: write
on:
push: { branches: [master] }
jobs:
release-plz:
name: Release-plz
runs-on: ubuntu-24.04
steps:
- name: Checkout repository
uses: actions/checkout@v4
with:
fetch-depth: 0
- name: Install Rust (rustup)
run: rustup update nightly --no-self-update && rustup default nightly
- name: Run release-plz
uses: MarcoIeni/release-plz-action@v0.5
env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
CARGO_REGISTRY_TOKEN: ${{ secrets.CARGO_REGISTRY_TOKEN }}

16
library/compiler-builtins/.gitignore vendored Normal file
View file

@ -0,0 +1,16 @@
# Rust files
Cargo.lock
target
# Sources for external files
compiler-rt
*.tar.gz
# Benchmark cache
baseline-*
iai-home
# Temporary files
*.bk
*.rs.bk
.#*

4
library/compiler-builtins/.gitmodules vendored Normal file
View file

@ -0,0 +1,4 @@
[submodule "crates/musl-math-sys/musl"]
path = crates/musl-math-sys/musl
url = https://git.musl-libc.org/git/musl
shallow = true

View file

@ -0,0 +1,13 @@
[workspace]
# As part of the release process, we delete `libm/Cargo.toml`. Since
# this is only run in CI, we shouldn't need to worry about it.
allow_dirty = true
publish_allow_dirty = true
[[package]]
name = "compiler_builtins"
semver_check = false
changelog_include = ["libm"] # libm is included as part of builtins
[[package]]
name = "libm"

View file

@ -0,0 +1,4 @@
# This matches rustc
style_edition = "2024"
group_imports = "StdExternalCrate"
imports_granularity = "Module"

View file

@ -0,0 +1,167 @@
# How to contribute
## compiler-builtins
1. From the [pending list](compiler-builtins/README.md#progress), pick one or
more intrinsics.
2. Port the version from [`compiler-rt`] and, if applicable, their
[tests][rt-tests]. Note that this crate has generic implementations for a lot
of routines, which may be usable without porting the entire implementation.
3. Add a test to `builtins-test`, comparing the behavior of the ported
intrinsic(s) with their implementation on the testing host.
4. Add the intrinsic to `builtins-test-intrinsics/src/main.rs` to verify it can
be linked on all targets.
5. Send a Pull Request (PR) :tada:.
[`compiler-rt`]: https://github.com/llvm/llvm-project/tree/b6820c35c59a4da3e59c11f657093ffbd79ae1db/compiler-rt/lib/builtins
[rt-tests]: https://github.com/llvm/llvm-project/tree/b6820c35c59a4da3e59c11f657093ffbd79ae1db/compiler-rt/test/builtins
## Porting Reminders
1. [Rust][prec-rust] and [C][prec-c] have slightly different operator
precedence. C evaluates comparisons (`== !=`) before bitwise operations
(`& | ^`), while Rust evaluates the other way.
2. C assumes wrapping operations everywhere. Rust panics on overflow when in
debug mode. Consider using the [Wrapping][wrap-ty] type or the explicit
[wrapping_*][wrap-fn] functions where applicable.
3. Note [C implicit casts][casts], especially integer promotion. Rust is much
more explicit about casting, so be sure that any cast which affects the
output is ported to the Rust implementation.
4. Rust has [many functions][i32] for integer or floating point manipulation in
the standard library. Consider using one of these functions rather than
porting a new one.
[prec-rust]: https://doc.rust-lang.org/reference/expressions.html#expression-precedence
[prec-c]: http://en.cppreference.com/w/c/language/operator_precedence
[wrap-ty]: https://doc.rust-lang.org/core/num/struct.Wrapping.html
[wrap-fn]: https://doc.rust-lang.org/std/primitive.i32.html#method.wrapping_add
[casts]: http://en.cppreference.com/w/cpp/language/implicit_conversion
[i32]: https://doc.rust-lang.org/std/primitive.i32.html
## Tips and tricks
- _IMPORTANT_ The code in this crate will end up being used in the `core` crate
so it can **not** have any external dependencies (other than a subset of
`core` itself).
- Only use relative imports within the `math` directory / module, e.g.
`use self::fabs::fabs` or `use super::k_cos`. Absolute imports from core are
OK, e.g. `use core::u64`.
- To reinterpret a float as an integer use the `to_bits` method. The MUSL code
uses the `GET_FLOAT_WORD` macro, or a union, to do this operation.
- To reinterpret an integer as a float use the `f32::from_bits` constructor. The
MUSL code uses the `SET_FLOAT_WORD` macro, or a union, to do this operation.
- You may use other methods from core like `f64::is_nan`, etc. as appropriate.
- Rust does not have hex float literals. This crate provides the `hf16!`,
`hf32!`, `hf64!`, and `hf128!` macros, which convert string literals to floats at
compile time.
```rust
assert_eq!(hf32!("0x1.ffep+8").to_bits(), 0x43fff000);
assert_eq!(hf64!("0x1.ffep+8").to_bits(), 0x407ffe0000000000);
```
- Rust code panics on arithmetic overflows when not optimized. You may need to
use the [`Wrapping`] newtype to avoid this problem, or individual methods like
[`wrapping_add`].
[`Wrapping`]: https://doc.rust-lang.org/std/num/struct.Wrapping.html
[`wrapping_add`]: https://doc.rust-lang.org/std/primitive.u32.html#method.wrapping_add
## Testing
Testing for these crates can be somewhat complex, so feel free to rely on CI.
The easiest way to replicate CI testing is using Docker. This can be done by
running `./ci/run-docker.sh [target]`. If no target is specified, all targets
will be run.
Tests can also be run without Docker:
```sh
# Run basic tests
#
# --no-default-features always needs to be passed, an unfortunate limitation
# since the `#![compiler_builtins]` feature is enabled by default.
cargo test --workspace --no-default-features
# Test with all interesting features
cargo test --workspace --no-default-features \
--features arch,unstable-float,unstable-intrinsics,mem
# Run with more detailed tests for libm
cargo test --workspace --no-default-features \
--features arch,unstable-float,unstable-intrinsics,mem \
--features build-mpfr,build-musl \
--profile release-checked
```
The multiprecision tests use the [`rug`] crate for bindings to MPFR. MPFR can be
difficult to build on non-Unix systems, refer to [`gmp_mpfr_sys`] for help.
`build-musl` does not build with MSVC, Wasm, or Thumb.
[`rug`]: https://docs.rs/rug/latest/rug/
[`gmp_mpfr_sys`]: https://docs.rs/gmp-mpfr-sys/1.6.4/gmp_mpfr_sys/
In order to run all tests, some dependencies may be required:
```sh
# Allow testing compiler-builtins
./ci/download-compiler-rt.sh
# Optional, initialize musl for `--features build-musl`
git submodule init
git submodule update
# `--release` enables more test cases
cargo test --release
```
### Extensive tests
Libm also has tests that are exhaustive (for single-argument `f32` and 1- or 2-
argument `f16`) or extensive (for all other float and argument combinations).
These take quite a long time to run, but are launched in CI when relevant files
are changed.
Exhaustive tests can be selected by passing an environment variable:
```sh
LIBM_EXTENSIVE_TESTS=sqrt,sqrtf cargo test --features build-mpfr \
--test z_extensive \
--profile release-checked
# Run all tests for one type
LIBM_EXTENSIVE_TESTS=all_f16 cargo test ...
# Ensure `f64` tests can run exhaustively. The estimated completion time for a
# single test is 57306 years on my machine, so this may be worth skipping.
LIBM_EXTENSIVE_TESTS=all LIBM_EXTENSIVE_ITERATIONS=18446744073709551615 cargo test ...
```
## Benchmarking
Regular walltime benchmarks can be run with `cargo bench`:
```sh
cargo bench --no-default-features \
--features arch,unstable-float,unstable-intrinsics,mem \
--features benchmarking-reports
```
There are also benchmarks that check instruction count behind the `icount`
feature. These require [`iai-callgrind-runner`] (via Cargo) and [Valgrind]
to be installed, which means these only run on limited platforms.
Instruction count benchmarks are run as part of CI to flag performance
regressions.
```sh
cargo bench --no-default-features \
--features arch,unstable-float,unstable-intrinsics,mem \
--features icount \
--bench icount --bench mem_icount
```
[`iai-callgrind-runner`]: https://crates.io/crates/iai-callgrind-runner
[Valgrind]: https://valgrind.org/

View file

@ -0,0 +1,50 @@
[workspace]
resolver = "2"
members = [
"builtins-test",
"compiler-builtins",
"crates/libm-macros",
"crates/musl-math-sys",
"crates/panic-handler",
"crates/util",
"libm",
"libm-test",
]
default-members = [
"builtins-test",
"compiler-builtins",
"crates/libm-macros",
"libm",
"libm-test",
]
exclude = [
# `builtins-test-intrinsics` needs the feature `compiler-builtins` enabled
# and `mangled-names` disabled, which is the opposite of what is needed for
# other tests, so it makes sense to keep it out of the workspace.
"builtins-test-intrinsics",
]
[profile.release]
panic = "abort"
[profile.dev]
panic = "abort"
# Release mode with debug assertions
[profile.release-checked]
inherits = "release"
debug-assertions = true
overflow-checks = true
# Release with maximum optimizations, which is very slow to build. This is also
# what is needed to check `no-panic`.
[profile.release-opt]
inherits = "release"
codegen-units = 1
lto = "fat"
[profile.bench]
# Required for iai-callgrind
debug = true

View file

@ -0,0 +1,275 @@
The compiler-builtins crate is available for use under both the MIT license
and the Apache-2.0 license with the LLVM exception (MIT AND Apache-2.0 WITH
LLVM-exception).
The libm crate is available for use under the MIT license.
As a contributor, you agree that your code may be used under any of the
following: the MIT license, the Apache-2.0 license, or the Apache-2.0 license
with the LLVM exception. In other words, original (non-derivative) work is
licensed under MIT OR Apache-2.0 OR Apache-2.0 WITH LLVM-exception. This is
the default license for all other source in this repository.
Text of the relevant licenses is provided below:
------------------------------------------------------------------------------
MIT License
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
------------------------------------------------------------------------------
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
1. Definitions.
"License" shall mean the terms and conditions for use, reproduction,
and distribution as defined by Sections 1 through 9 of this document.
"Licensor" shall mean the copyright owner or entity authorized by
the copyright owner that is granting the License.
"Legal Entity" shall mean the union of the acting entity and all
other entities that control, are controlled by, or are under common
control with that entity. For the purposes of this definition,
"control" means (i) the power, direct or indirect, to cause the
direction or management of such entity, whether by contract or
otherwise, or (ii) ownership of fifty percent (50%) or more of the
outstanding shares, or (iii) beneficial ownership of such entity.
"You" (or "Your") shall mean an individual or Legal Entity
exercising permissions granted by this License.
"Source" form shall mean the preferred form for making modifications,
including but not limited to software source code, documentation
source, and configuration files.
"Object" form shall mean any form resulting from mechanical
transformation or translation of a Source form, including but
not limited to compiled object code, generated documentation,
and conversions to other media types.
"Work" shall mean the work of authorship, whether in Source or
Object form, made available under the License, as indicated by a
copyright notice that is included in or attached to the work
(an example is provided in the Appendix below).
"Derivative Works" shall mean any work, whether in Source or Object
form, that is based on (or derived from) the Work and for which the
editorial revisions, annotations, elaborations, or other modifications
represent, as a whole, an original work of authorship. For the purposes
of this License, Derivative Works shall not include works that remain
separable from, or merely link (or bind by name) to the interfaces of,
the Work and Derivative Works thereof.
"Contribution" shall mean any work of authorship, including
the original version of the Work and any modifications or additions
to that Work or Derivative Works thereof, that is intentionally
submitted to Licensor for inclusion in the Work by the copyright owner
or by an individual or Legal Entity authorized to submit on behalf of
the copyright owner. For the purposes of this definition, "submitted"
means any form of electronic, verbal, or written communication sent
to the Licensor or its representatives, including but not limited to
communication on electronic mailing lists, source code control systems,
and issue tracking systems that are managed by, or on behalf of, the
Licensor for the purpose of discussing and improving the Work, but
excluding communication that is conspicuously marked or otherwise
designated in writing by the copyright owner as "Not a Contribution."
"Contributor" shall mean Licensor and any individual or Legal Entity
on behalf of whom a Contribution has been received by Licensor and
subsequently incorporated within the Work.
2. Grant of Copyright License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
copyright license to reproduce, prepare Derivative Works of,
publicly display, publicly perform, sublicense, and distribute the
Work and such Derivative Works in Source or Object form.
3. Grant of Patent License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
(except as stated in this section) patent license to make, have made,
use, offer to sell, sell, import, and otherwise transfer the Work,
where such license applies only to those patent claims licensable
by such Contributor that are necessarily infringed by their
Contribution(s) alone or by combination of their Contribution(s)
with the Work to which such Contribution(s) was submitted. If You
institute patent litigation against any entity (including a
cross-claim or counterclaim in a lawsuit) alleging that the Work
or a Contribution incorporated within the Work constitutes direct
or contributory patent infringement, then any patent licenses
granted to You under this License for that Work shall terminate
as of the date such litigation is filed.
4. Redistribution. You may reproduce and distribute copies of the
Work or Derivative Works thereof in any medium, with or without
modifications, and in Source or Object form, provided that You
meet the following conditions:
(a) You must give any other recipients of the Work or
Derivative Works a copy of this License; and
(b) You must cause any modified files to carry prominent notices
stating that You changed the files; and
(c) You must retain, in the Source form of any Derivative Works
that You distribute, all copyright, patent, trademark, and
attribution notices from the Source form of the Work,
excluding those notices that do not pertain to any part of
the Derivative Works; and
(d) If the Work includes a "NOTICE" text file as part of its
distribution, then any Derivative Works that You distribute must
include a readable copy of the attribution notices contained
within such NOTICE file, excluding those notices that do not
pertain to any part of the Derivative Works, in at least one
of the following places: within a NOTICE text file distributed
as part of the Derivative Works; within the Source form or
documentation, if provided along with the Derivative Works; or,
within a display generated by the Derivative Works, if and
wherever such third-party notices normally appear. The contents
of the NOTICE file are for informational purposes only and
do not modify the License. You may add Your own attribution
notices within Derivative Works that You distribute, alongside
or as an addendum to the NOTICE text from the Work, provided
that such additional attribution notices cannot be construed
as modifying the License.
You may add Your own copyright statement to Your modifications and
may provide additional or different license terms and conditions
for use, reproduction, or distribution of Your modifications, or
for any such Derivative Works as a whole, provided Your use,
reproduction, and distribution of the Work otherwise complies with
the conditions stated in this License.
5. Submission of Contributions. Unless You explicitly state otherwise,
any Contribution intentionally submitted for inclusion in the Work
by You to the Licensor shall be under the terms and conditions of
this License, without any additional terms or conditions.
Notwithstanding the above, nothing herein shall supersede or modify
the terms of any separate license agreement you may have executed
with Licensor regarding such Contributions.
6. Trademarks. This License does not grant permission to use the trade
names, trademarks, service marks, or product names of the Licensor,
except as required for reasonable and customary use in describing the
origin of the Work and reproducing the content of the NOTICE file.
7. Disclaimer of Warranty. Unless required by applicable law or
agreed to in writing, Licensor provides the Work (and each
Contributor provides its Contributions) on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied, including, without limitation, any warranties or conditions
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
PARTICULAR PURPOSE. You are solely responsible for determining the
appropriateness of using or redistributing the Work and assume any
risks associated with Your exercise of permissions under this License.
8. Limitation of Liability. In no event and under no legal theory,
whether in tort (including negligence), contract, or otherwise,
unless required by applicable law (such as deliberate and grossly
negligent acts) or agreed to in writing, shall any Contributor be
liable to You for damages, including any direct, indirect, special,
incidental, or consequential damages of any character arising as a
result of this License or out of the use or inability to use the
Work (including but not limited to damages for loss of goodwill,
work stoppage, computer failure or malfunction, or any and all
other commercial damages or losses), even if such Contributor
has been advised of the possibility of such damages.
9. Accepting Warranty or Additional Liability. While redistributing
the Work or Derivative Works thereof, You may choose to offer,
and charge a fee for, acceptance of support, warranty, indemnity,
or other liability obligations and/or rights consistent with this
License. However, in accepting such obligations, You may act only
on Your own behalf and on Your sole responsibility, not on behalf
of any other Contributor, and only if You agree to indemnify,
defend, and hold each Contributor harmless for any liability
incurred by, or claims asserted against, such Contributor by reason
of your accepting any such warranty or additional liability.
END OF TERMS AND CONDITIONS
APPENDIX: How to apply the Apache License to your work.
To apply the Apache License to your work, attach the following
boilerplate notice, with the fields enclosed by brackets "[]"
replaced with your own identifying information. (Don't include
the brackets!) The text should be enclosed in the appropriate
comment syntax for the file format. We also recommend that a
file or class name and description of purpose be included on the
same "printed page" as the copyright notice for easier
identification within third-party archives.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
---- LLVM Exceptions to the Apache 2.0 License ----
As an exception, if, as a result of your compiling your source code, portions
of this Software are embedded into an Object form of such source code, you
may redistribute such embedded portions in such Object form without complying
with the conditions of Sections 4(a), 4(b) and 4(d) of the License.
In addition, if you combine or link compiled forms of this Software with
software that is licensed under the GPLv2 ("Combined Software") and if a
court of competent jurisdiction determines that the patent provision (Section
3), the indemnity provision (Section 9) or other Section of the License
conflicts with the conditions of the GPLv2, you may retroactively and
prospectively choose to deem waived or otherwise exclude such Section(s) of
the License, but only in their entirety and only with respect to the Combined
Software.
------------------------------------------------------------------------------
Portions of this software are derived from third-party works licensed under
terms compatible with the above Apache-2.0 WITH LLVM-exception AND MIT
license:
* compiler-builtins is derived from LLVM's compiler-rt (https://llvm.org/).
Work derived from compiler-rt prior to 2019-01-19 is usable under the MIT
license, with the following copyright:
Copyright (c) 2009-2016 by the contributors listed in CREDITS.TXT
The relevant CREDITS.TXT is located at
https://github.com/llvm/llvm-project/blob/main/compiler-rt/CREDITS.TXT.
* Work derived from compiler-rt after 2019-01-19 is usable under the
Apache-2.0 license with the LLVM exception.
* The bundled `math` module is from the libm crate, usable under the MIT
license. For further details and copyrights, see libm/LICENSE.txt at
https://github.com/rust-lang/compiler-builtins.
Additionally, some source files may contain comments with specific copyrights
or licenses.

View file

@ -0,0 +1,16 @@
# Publishing to crates.io
Publishing `compiler-builtins` to crates.io takes a few steps unfortunately.
It's not great, but it works for now. PRs to improve this process would be
greatly appreciated!
1. Make sure you've got a clean working tree and it's updated with the latest
changes on `master`
2. Edit `Cargo.toml` to bump the version number
3. Commit this change
4. Run `git tag` to create a tag for this version
5. Delete the `libm/Cargo.toml` file
6. Run `cargo +nightly publish`
7. Push the tag
8. Push the commit
9. Undo changes to `Cargo.toml` and the `libm` submodule

View file

@ -0,0 +1,27 @@
# `compiler-builtins` and `libm`
This repository contains two main crates:
* `compiler-builtins`: symbols that the compiler expects to be available at
link time
* `libm`: a Rust implementation of C math libraries, used to provide
implementations in `core`.
More details are at [compiler-builtins/README.md](compiler-builtins/README.md)
and [libm/README.md](libm/README.md).
For instructions on contributing, see [CONTRIBUTING.md](CONTRIBUTING.md).
## License
* `libm` may be used under the [MIT License]
* `compiler-builtins` may be used under the [MIT License] and the
[Apache License, Version 2.0] with the LLVM exception.
* All original contributions must be under all of: the MIT license, the
Apache-2.0 license, and the Apache-2.0 license with the LLVM exception.
More details are in [LICENSE.txt](LICENSE.txt) and
[libm/LICENSE.txt](libm/LICENSE.txt).
[MIT License]: https://opensource.org/license/mit
[Apache License, Version 2.0]: https://www.apache.org/licenses/LICENSE-2.0

View file

@ -0,0 +1,19 @@
[package]
name = "builtins-test-intrinsics"
version = "0.1.0"
edition = "2021"
publish = false
license = "MIT OR Apache-2.0"
[dependencies]
compiler_builtins = { path = "../compiler-builtins", features = ["compiler-builtins"]}
panic-handler = { path = "../crates/panic-handler" }
[features]
c = ["compiler_builtins/c"]
[profile.release]
panic = "abort"
[profile.dev]
panic = "abort"

View file

@ -0,0 +1,11 @@
mod builtins_configure {
    include!("../compiler-builtins/configure.rs");
}

/// Build script: re-run when the shared configure script changes, then apply
/// the target-specific configuration (f16/f128 availability and aliases).
fn main() {
    // The rerun path must match the file pulled in by `include!` above;
    // `../configure.rs` pointed at a non-existent file, so changes to the real
    // configure script would never trigger a rebuild.
    println!("cargo::rerun-if-changed=../compiler-builtins/configure.rs");
    let target = builtins_configure::Target::from_env();
    builtins_configure::configure_f16_f128(&target);
    builtins_configure::configure_aliases(&target);
}

View file

@ -0,0 +1,697 @@
// By compiling this file we check that all the intrinsics we care about continue to be provided by
// the `compiler_builtins` crate regardless of the changes we make to it. If we, by mistake, stop
// compiling a C implementation and forget to implement that intrinsic in Rust, this file will fail
// to link due to the missing intrinsic (symbol).
#![allow(unused_features)]
#![allow(internal_features)]
#![deny(dead_code)]
#![feature(allocator_api)]
#![feature(f128)]
#![feature(f16)]
#![feature(lang_items)]
#![no_std]
#![no_main]
extern crate panic_handler;
#[cfg(all(not(thumb), not(windows), not(target_arch = "wasm32")))]
#[link(name = "c")]
extern "C" {}
// Every function in this module will be lowered to an intrinsic by LLVM if the platform
// doesn't have native support for the operation used in the function. ARM has a naming
// convention for its intrinsics that's different from other architectures; that's why some
// functions have an additional comment: the function name is the ARM name for the intrinsic
// and the comment is the non-ARM name for the intrinsic.
mod intrinsics {
/* f16 operations */
#[cfg(f16_enabled)]
pub fn extendhfsf(x: f16) -> f32 {
x as f32
}
#[cfg(f16_enabled)]
pub fn extendhfdf(x: f16) -> f64 {
x as f64
}
#[cfg(all(
f16_enabled,
f128_enabled,
not(any(target_arch = "powerpc", target_arch = "powerpc64"))
))]
pub fn extendhftf(x: f16) -> f128 {
x as f128
}
/* f32 operations */
#[cfg(f16_enabled)]
pub fn truncsfhf(x: f32) -> f16 {
x as f16
}
// extendsfdf2
pub fn aeabi_f2d(x: f32) -> f64 {
x as f64
}
#[cfg(f128_enabled)]
pub fn extendsftf(x: f32) -> f128 {
x as f128
}
// fixsfsi
pub fn aeabi_f2iz(x: f32) -> i32 {
x as i32
}
// fixsfdi
pub fn aeabi_f2lz(x: f32) -> i64 {
x as i64
}
pub fn fixsfti(x: f32) -> i128 {
x as i128
}
// fixunssfsi
pub fn aeabi_f2uiz(x: f32) -> u32 {
x as u32
}
// fixunssfdi
pub fn aeabi_f2ulz(x: f32) -> u64 {
x as u64
}
pub fn fixunssfti(x: f32) -> u128 {
x as u128
}
// addsf3
pub fn aeabi_fadd(a: f32, b: f32) -> f32 {
a + b
}
// eqsf2
pub fn aeabi_fcmpeq(a: f32, b: f32) -> bool {
a == b
}
// gtsf2
pub fn aeabi_fcmpgt(a: f32, b: f32) -> bool {
a > b
}
// ltsf2
pub fn aeabi_fcmplt(a: f32, b: f32) -> bool {
a < b
}
// divsf3
pub fn aeabi_fdiv(a: f32, b: f32) -> f32 {
a / b
}
// mulsf3
pub fn aeabi_fmul(a: f32, b: f32) -> f32 {
a * b
}
// subsf3
pub fn aeabi_fsub(a: f32, b: f32) -> f32 {
a - b
}
/* f64 operations */
// truncdfsf2
pub fn aeabi_d2f(x: f64) -> f32 {
x as f32
}
// fixdfsi
pub fn aeabi_d2i(x: f64) -> i32 {
x as i32
}
// fixdfdi
pub fn aeabi_d2l(x: f64) -> i64 {
x as i64
}
pub fn fixdfti(x: f64) -> i128 {
x as i128
}
// fixunsdfsi
pub fn aeabi_d2uiz(x: f64) -> u32 {
x as u32
}
// fixunsdfdi
pub fn aeabi_d2ulz(x: f64) -> u64 {
x as u64
}
pub fn fixunsdfti(x: f64) -> u128 {
x as u128
}
// adddf3
pub fn aeabi_dadd(a: f64, b: f64) -> f64 {
a + b
}
// eqdf2
pub fn aeabi_dcmpeq(a: f64, b: f64) -> bool {
a == b
}
// gtdf2
pub fn aeabi_dcmpgt(a: f64, b: f64) -> bool {
a > b
}
// ltdf2
pub fn aeabi_dcmplt(a: f64, b: f64) -> bool {
a < b
}
// divdf3
pub fn aeabi_ddiv(a: f64, b: f64) -> f64 {
a / b
}
// muldf3
pub fn aeabi_dmul(a: f64, b: f64) -> f64 {
a * b
}
// subdf3
pub fn aeabi_dsub(a: f64, b: f64) -> f64 {
a - b
}
/* f128 operations */
#[cfg(all(
f16_enabled,
f128_enabled,
not(any(target_arch = "powerpc", target_arch = "powerpc64"))
))]
pub fn trunctfhf(x: f128) -> f16 {
x as f16
}
#[cfg(f128_enabled)]
pub fn trunctfsf(x: f128) -> f32 {
x as f32
}
#[cfg(f128_enabled)]
pub fn trunctfdf(x: f128) -> f64 {
x as f64
}
#[cfg(all(
f128_enabled,
not(any(target_arch = "powerpc", target_arch = "powerpc64"))
))]
pub fn fixtfsi(x: f128) -> i32 {
x as i32
}
#[cfg(all(
f128_enabled,
not(any(target_arch = "powerpc", target_arch = "powerpc64"))
))]
pub fn fixtfdi(x: f128) -> i64 {
x as i64
}
#[cfg(all(
f128_enabled,
not(any(target_arch = "powerpc", target_arch = "powerpc64"))
))]
pub fn fixtfti(x: f128) -> i128 {
x as i128
}
#[cfg(all(
f128_enabled,
not(any(target_arch = "powerpc", target_arch = "powerpc64"))
))]
pub fn fixunstfsi(x: f128) -> u32 {
x as u32
}
#[cfg(all(
f128_enabled,
not(any(target_arch = "powerpc", target_arch = "powerpc64"))
))]
pub fn fixunstfdi(x: f128) -> u64 {
x as u64
}
#[cfg(all(
f128_enabled,
not(any(target_arch = "powerpc", target_arch = "powerpc64"))
))]
pub fn fixunstfti(x: f128) -> u128 {
x as u128
}
#[cfg(f128_enabled)]
pub fn addtf(a: f128, b: f128) -> f128 {
a + b
}
#[cfg(f128_enabled)]
pub fn eqtf(a: f128, b: f128) -> bool {
a == b
}
#[cfg(f128_enabled)]
pub fn gttf(a: f128, b: f128) -> bool {
a > b
}
#[cfg(f128_enabled)]
pub fn lttf(a: f128, b: f128) -> bool {
a < b
}
#[cfg(f128_enabled)]
pub fn multf(a: f128, b: f128) -> f128 {
a * b
}
#[cfg(f128_enabled)]
pub fn divtf(a: f128, b: f128) -> f128 {
a / b
}
#[cfg(f128_enabled)]
pub fn subtf(a: f128, b: f128) -> f128 {
a - b
}
/* i32 operations */
// floatsisf
pub fn aeabi_i2f(x: i32) -> f32 {
x as f32
}
// floatsidf
pub fn aeabi_i2d(x: i32) -> f64 {
x as f64
}
#[cfg(f128_enabled)]
pub fn floatsitf(x: i32) -> f128 {
x as f128
}
pub fn aeabi_idiv(a: i32, b: i32) -> i32 {
a.wrapping_div(b)
}
pub fn aeabi_idivmod(a: i32, b: i32) -> i32 {
a % b
}
/* i64 operations */
// floatdisf
pub fn aeabi_l2f(x: i64) -> f32 {
x as f32
}
// floatdidf
pub fn aeabi_l2d(x: i64) -> f64 {
x as f64
}
#[cfg(f128_enabled)]
pub fn floatditf(x: i64) -> f128 {
x as f128
}
pub fn mulodi4(a: i64, b: i64) -> i64 {
a * b
}
// divdi3
pub fn aeabi_ldivmod(a: i64, b: i64) -> i64 {
a / b
}
pub fn moddi3(a: i64, b: i64) -> i64 {
a % b
}
// muldi3
pub fn aeabi_lmul(a: i64, b: i64) -> i64 {
a.wrapping_mul(b)
}
/* i128 operations */
pub fn floattisf(x: i128) -> f32 {
x as f32
}
pub fn floattidf(x: i128) -> f64 {
x as f64
}
#[cfg(f128_enabled)]
pub fn floattitf(x: i128) -> f128 {
x as f128
}
pub fn lshrti3(a: i128, b: usize) -> i128 {
a >> b
}
pub fn divti3(a: i128, b: i128) -> i128 {
a / b
}
pub fn modti3(a: i128, b: i128) -> i128 {
a % b
}
/* u32 operations */
// floatunsisf
pub fn aeabi_ui2f(x: u32) -> f32 {
x as f32
}
// floatunsidf
pub fn aeabi_ui2d(x: u32) -> f64 {
x as f64
}
#[cfg(f128_enabled)]
pub fn floatunsitf(x: u32) -> f128 {
x as f128
}
pub fn aeabi_uidiv(a: u32, b: u32) -> u32 {
a / b
}
pub fn aeabi_uidivmod(a: u32, b: u32) -> u32 {
a % b
}
/* u64 operations */
// floatundisf
pub fn aeabi_ul2f(x: u64) -> f32 {
x as f32
}
// floatundidf
pub fn aeabi_ul2d(x: u64) -> f64 {
x as f64
}
#[cfg(f128_enabled)]
pub fn floatunditf(x: u64) -> f128 {
x as f128
}
// udivdi3
pub fn aeabi_uldivmod(a: u64, b: u64) -> u64 {
a * b
}
pub fn umoddi3(a: u64, b: u64) -> u64 {
a % b
}
/* u128 operations */
pub fn floatuntisf(x: u128) -> f32 {
x as f32
}
pub fn floatuntidf(x: u128) -> f64 {
x as f64
}
#[cfg(f128_enabled)]
pub fn floatuntitf(x: u128) -> f128 {
x as f128
}
pub fn muloti4(a: u128, b: u128) -> Option<u128> {
a.checked_mul(b)
}
pub fn multi3(a: u128, b: u128) -> u128 {
a.wrapping_mul(b)
}
pub fn ashlti3(a: u128, b: usize) -> u128 {
a >> b
}
pub fn ashrti3(a: u128, b: usize) -> u128 {
a << b
}
pub fn udivti3(a: u128, b: u128) -> u128 {
a / b
}
pub fn umodti3(a: u128, b: u128) -> u128 {
a % b
}
}
fn run() {
use core::hint::black_box as bb;
use intrinsics::*;
// FIXME(f16_f128): some PPC f128 <-> int conversion functions have the wrong names
#[cfg(f128_enabled)]
bb(addtf(bb(2.), bb(2.)));
bb(aeabi_d2f(bb(2.)));
bb(aeabi_d2i(bb(2.)));
bb(aeabi_d2l(bb(2.)));
bb(aeabi_d2uiz(bb(2.)));
bb(aeabi_d2ulz(bb(2.)));
bb(aeabi_dadd(bb(2.), bb(3.)));
bb(aeabi_dcmpeq(bb(2.), bb(3.)));
bb(aeabi_dcmpgt(bb(2.), bb(3.)));
bb(aeabi_dcmplt(bb(2.), bb(3.)));
bb(aeabi_ddiv(bb(2.), bb(3.)));
bb(aeabi_dmul(bb(2.), bb(3.)));
bb(aeabi_dsub(bb(2.), bb(3.)));
bb(aeabi_f2d(bb(2.)));
bb(aeabi_f2iz(bb(2.)));
bb(aeabi_f2lz(bb(2.)));
bb(aeabi_f2uiz(bb(2.)));
bb(aeabi_f2ulz(bb(2.)));
bb(aeabi_fadd(bb(2.), bb(3.)));
bb(aeabi_fcmpeq(bb(2.), bb(3.)));
bb(aeabi_fcmpgt(bb(2.), bb(3.)));
bb(aeabi_fcmplt(bb(2.), bb(3.)));
bb(aeabi_fdiv(bb(2.), bb(3.)));
bb(aeabi_fmul(bb(2.), bb(3.)));
bb(aeabi_fsub(bb(2.), bb(3.)));
bb(aeabi_i2d(bb(2)));
bb(aeabi_i2f(bb(2)));
bb(aeabi_idiv(bb(2), bb(3)));
bb(aeabi_idivmod(bb(2), bb(3)));
bb(aeabi_l2d(bb(2)));
bb(aeabi_l2f(bb(2)));
bb(aeabi_ldivmod(bb(2), bb(3)));
bb(aeabi_lmul(bb(2), bb(3)));
bb(aeabi_ui2d(bb(2)));
bb(aeabi_ui2f(bb(2)));
bb(aeabi_uidiv(bb(2), bb(3)));
bb(aeabi_uidivmod(bb(2), bb(3)));
bb(aeabi_ul2d(bb(2)));
bb(aeabi_ul2f(bb(2)));
bb(aeabi_uldivmod(bb(2), bb(3)));
bb(ashlti3(bb(2), bb(2)));
bb(ashrti3(bb(2), bb(2)));
#[cfg(f128_enabled)]
bb(divtf(bb(2.), bb(2.)));
bb(divti3(bb(2), bb(2)));
#[cfg(f128_enabled)]
bb(eqtf(bb(2.), bb(2.)));
#[cfg(f16_enabled)]
bb(extendhfdf(bb(2.)));
#[cfg(f16_enabled)]
bb(extendhfsf(bb(2.)));
#[cfg(all(
f16_enabled,
f128_enabled,
not(any(target_arch = "powerpc", target_arch = "powerpc64"))
))]
bb(extendhftf(bb(2.)));
#[cfg(f128_enabled)]
bb(extendsftf(bb(2.)));
bb(fixdfti(bb(2.)));
bb(fixsfti(bb(2.)));
#[cfg(all(
f128_enabled,
not(any(target_arch = "powerpc", target_arch = "powerpc64"))
))]
bb(fixtfdi(bb(2.)));
#[cfg(all(
f128_enabled,
not(any(target_arch = "powerpc", target_arch = "powerpc64"))
))]
bb(fixtfsi(bb(2.)));
#[cfg(all(
f128_enabled,
not(any(target_arch = "powerpc", target_arch = "powerpc64"))
))]
bb(fixtfti(bb(2.)));
bb(fixunsdfti(bb(2.)));
bb(fixunssfti(bb(2.)));
#[cfg(all(
f128_enabled,
not(any(target_arch = "powerpc", target_arch = "powerpc64"))
))]
bb(fixunstfdi(bb(2.)));
#[cfg(all(
f128_enabled,
not(any(target_arch = "powerpc", target_arch = "powerpc64"))
))]
bb(fixunstfsi(bb(2.)));
#[cfg(all(
f128_enabled,
not(any(target_arch = "powerpc", target_arch = "powerpc64"))
))]
bb(fixunstfti(bb(2.)));
#[cfg(f128_enabled)]
bb(floatditf(bb(2)));
#[cfg(f128_enabled)]
bb(floatsitf(bb(2)));
bb(floattidf(bb(2)));
bb(floattisf(bb(2)));
#[cfg(f128_enabled)]
bb(floattitf(bb(2)));
#[cfg(f128_enabled)]
bb(floatunditf(bb(2)));
#[cfg(f128_enabled)]
bb(floatunsitf(bb(2)));
bb(floatuntidf(bb(2)));
bb(floatuntisf(bb(2)));
#[cfg(f128_enabled)]
bb(floatuntitf(bb(2)));
#[cfg(f128_enabled)]
bb(gttf(bb(2.), bb(2.)));
bb(lshrti3(bb(2), bb(2)));
#[cfg(f128_enabled)]
bb(lttf(bb(2.), bb(2.)));
bb(moddi3(bb(2), bb(3)));
bb(modti3(bb(2), bb(2)));
bb(mulodi4(bb(2), bb(3)));
bb(muloti4(bb(2), bb(2)));
#[cfg(f128_enabled)]
bb(multf(bb(2.), bb(2.)));
bb(multi3(bb(2), bb(2)));
#[cfg(f128_enabled)]
bb(subtf(bb(2.), bb(2.)));
#[cfg(f16_enabled)]
bb(truncsfhf(bb(2.)));
#[cfg(f128_enabled)]
bb(trunctfdf(bb(2.)));
#[cfg(all(
f16_enabled,
f128_enabled,
not(any(target_arch = "powerpc", target_arch = "powerpc64"))
))]
bb(trunctfhf(bb(2.)));
#[cfg(f128_enabled)]
bb(trunctfsf(bb(2.)));
bb(udivti3(bb(2), bb(2)));
bb(umoddi3(bb(2), bb(3)));
bb(umodti3(bb(2), bb(2)));
something_with_a_dtor(&|| assert_eq!(bb(1), 1));
// FIXME(#802): This should be re-enabled once a workaround is found.
// extern "C" {
// fn rust_begin_unwind(x: usize);
// }
// unsafe {
// rust_begin_unwind(0);
// }
}
/// Runs `f` twice: once directly, and once more from a destructor, so that
/// the drop-glue code paths are linked into the binary as well.
fn something_with_a_dtor(f: &dyn Fn()) {
    // Guard type whose destructor invokes the wrapped closure.
    struct Guard<'a>(&'a (dyn Fn() + 'a));
    impl Drop for Guard<'_> {
        fn drop(&mut self) {
            (self.0)();
        }
    }
    let _guard = Guard(f);
    // Direct call happens first; the guard fires at end of scope.
    f();
}
// C-style entry point for non-thumb targets: exercise every intrinsic, then
// report success (exit code 0).
#[unsafe(no_mangle)]
#[cfg(not(thumb))]
fn main(_argc: core::ffi::c_int, _argv: *const *const u8) -> core::ffi::c_int {
    run();
    0
}
// Bare-metal entry point for thumb targets: run the checks, then spin
// forever since there is no runtime to return to.
#[unsafe(no_mangle)]
#[cfg(thumb)]
pub fn _start() -> ! {
    run();
    loop {}
}
#[cfg(windows)]
#[link(name = "kernel32")]
#[link(name = "msvcrt")]
extern "C" {}
// ARM targets need these symbols
#[unsafe(no_mangle)]
pub fn __aeabi_unwind_cpp_pr0() {}
#[unsafe(no_mangle)]
pub fn __aeabi_unwind_cpp_pr1() {}
// Stub out the unwinder entry points; this test never actually unwinds, the
// symbols only need to exist so the binary links.
#[cfg(not(any(windows, target_os = "cygwin")))]
#[allow(non_snake_case)]
#[unsafe(no_mangle)]
pub fn _Unwind_Resume() {}
#[cfg(not(any(windows, target_os = "cygwin")))]
#[lang = "eh_personality"]
pub extern "C" fn eh_personality() {}
// Stub unwinding symbols for MinGW/Cygwin targets so linking succeeds
// without pulling in a real unwinder implementation.
#[cfg(any(all(windows, target_env = "gnu"), target_os = "cygwin"))]
mod mingw_unwinding {
    #[unsafe(no_mangle)]
    pub fn rust_eh_personality() {}
    #[unsafe(no_mangle)]
    pub fn rust_eh_unwind_resume() {}
    #[unsafe(no_mangle)]
    pub fn rust_eh_register_frames() {}
    #[unsafe(no_mangle)]
    pub fn rust_eh_unregister_frames() {}
}

View file

@ -0,0 +1,99 @@
[package]
name = "builtins-test"
version = "0.1.0"
authors = ["Alex Crichton <alex@alexcrichton.com>"]
edition = "2024"
publish = false
license = "MIT AND Apache-2.0 WITH LLVM-exception AND (MIT OR Apache-2.0)"
[dependencies]
# For fuzzing tests we want a deterministic seedable RNG. We also eliminate potential
# problems with system RNGs on the variety of platforms this crate is tested on.
# `xoshiro128**` is used for its quality, size, and speed at generating `u32` shift amounts.
rand_xoshiro = "0.6"
# To compare float builtins against
rustc_apfloat = "0.2.1"
# Really a dev dependency, but dev dependencies can't be optional
iai-callgrind = { version = "0.14.0", optional = true }
[dependencies.compiler_builtins]
path = "../compiler-builtins"
default-features = false
features = ["unstable-public-internals"]
[dev-dependencies]
criterion = { version = "0.5.1", default-features = false, features = ["cargo_bench_support"] }
paste = "1.0.15"
[target.'cfg(all(target_arch = "arm", not(any(target_env = "gnu", target_env = "musl")), target_os = "linux"))'.dev-dependencies]
test = { git = "https://github.com/japaric/utest" }
utest-cortex-m-qemu = { default-features = false, git = "https://github.com/japaric/utest" }
utest-macros = { git = "https://github.com/japaric/utest" }
[features]
default = ["mangled-names"]
c = ["compiler_builtins/c"]
no-asm = ["compiler_builtins/no-asm"]
no-f16-f128 = ["compiler_builtins/no-f16-f128"]
mem = ["compiler_builtins/mem"]
mangled-names = ["compiler_builtins/mangled-names"]
# Skip tests that rely on f128 symbols being available on the system
no-sys-f128 = ["no-sys-f128-int-convert", "no-sys-f16-f128-convert"]
# Some platforms have some f128 functions but everything except integer conversions
no-sys-f128-int-convert = []
no-sys-f16-f128-convert = []
no-sys-f16-f64-convert = []
# Skip tests that rely on f16 symbols being available on the system
no-sys-f16 = ["no-sys-f16-f64-convert"]
# Enable icount benchmarks (requires iai-callgrind and valgrind)
icount = ["dep:iai-callgrind"]
# Enable report generation without bringing in more dependencies by default
benchmarking-reports = ["criterion/plotters", "criterion/html_reports"]
# NOTE: benchmarks must be run with `--no-default-features` or with
# `-p builtins-test`, otherwise the default `compiler-builtins` feature
# of the `compiler_builtins` crate gets activated, resulting in linker
# errors.
[[bench]]
name = "float_add"
harness = false
[[bench]]
name = "float_sub"
harness = false
[[bench]]
name = "float_mul"
harness = false
[[bench]]
name = "float_div"
harness = false
[[bench]]
name = "float_cmp"
harness = false
[[bench]]
name = "float_conv"
harness = false
[[bench]]
name = "float_extend"
harness = false
[[bench]]
name = "float_trunc"
harness = false
[[bench]]
name = "float_pow"
harness = false
[[bench]]
name = "mem_icount"
harness = false
required-features = ["icount"]

View file

@ -0,0 +1,93 @@
#![cfg_attr(f128_enabled, feature(f128))]
use builtins_test::float_bench;
use compiler_builtins::float::add;
use criterion::{Criterion, criterion_main};
float_bench! {
name: add_f32,
sig: (a: f32, b: f32) -> f32,
crate_fn: add::__addsf3,
sys_fn: __addsf3,
sys_available: all(),
asm: [
#[cfg(target_arch = "x86_64")] {
asm!(
"addss {a}, {b}",
a = inout(xmm_reg) a,
b = in(xmm_reg) b,
options(nomem, nostack, pure)
);
a
};
#[cfg(target_arch = "aarch64")] {
asm!(
"fadd {a:s}, {a:s}, {b:s}",
a = inout(vreg) a,
b = in(vreg) b,
options(nomem, nostack, pure)
);
a
};
],
}
float_bench! {
name: add_f64,
sig: (a: f64, b: f64) -> f64,
crate_fn: add::__adddf3,
sys_fn: __adddf3,
sys_available: all(),
asm: [
#[cfg(target_arch = "x86_64")] {
asm!(
"addsd {a}, {b}",
a = inout(xmm_reg) a,
b = in(xmm_reg) b,
options(nomem, nostack, pure)
);
a
};
#[cfg(target_arch = "aarch64")] {
asm!(
"fadd {a:d}, {a:d}, {b:d}",
a = inout(vreg) a,
b = in(vreg) b,
options(nomem, nostack, pure)
);
a
};
],
}
#[cfg(f128_enabled)]
float_bench! {
name: add_f128,
sig: (a: f128, b: f128) -> f128,
crate_fn: add::__addtf3,
crate_fn_ppc: add::__addkf3,
sys_fn: __addtf3,
sys_fn_ppc: __addkf3,
sys_available: not(feature = "no-sys-f128"),
asm: []
}
// Run all addition benchmarks; the `f128` variant only when the type is
// enabled for the current target.
pub fn float_add() {
    let mut criterion = Criterion::default().configure_from_args();
    add_f32(&mut criterion);
    add_f64(&mut criterion);
    #[cfg(f128_enabled)]
    {
        add_f128(&mut criterion);
    }
}
criterion_main!(float_add);

View file

@ -0,0 +1,207 @@
#![cfg_attr(f128_enabled, feature(f128))]
use builtins_test::float_bench;
use compiler_builtins::float::cmp;
use criterion::{Criterion, criterion_main};
/// `gt` symbols are allowed to return differing results, they just get compared
/// to 0.
fn gt_res_eq(a: i32, b: i32) -> bool {
    // Two results agree iff they fall on the same side of zero: both strictly
    // positive ("greater") or both non-positive ("not greater").
    (a > 0) == (b > 0)
}
float_bench! {
name: cmp_f32_gt,
sig: (a: f32, b: f32) -> i32,
crate_fn: cmp::__gtsf2,
sys_fn: __gtsf2,
sys_available: all(),
output_eq: gt_res_eq,
asm: [
#[cfg(target_arch = "x86_64")] {
let ret: i32;
asm!(
"xor {ret:e}, {ret:e}",
"ucomiss {a}, {b}",
"seta {ret:l}",
a = in(xmm_reg) a,
b = in(xmm_reg) b,
ret = out(reg) ret,
options(nomem, nostack, pure)
);
ret
};
#[cfg(target_arch = "aarch64")] {
let ret: i32;
asm!(
"fcmp {a:s}, {b:s}",
"cset {ret:w}, gt",
a = in(vreg) a,
b = in(vreg) b,
ret = out(reg) ret,
options(nomem,nostack),
);
ret
};
],
}
float_bench! {
name: cmp_f32_unord,
sig: (a: f32, b: f32) -> i32,
crate_fn: cmp::__unordsf2,
sys_fn: __unordsf2,
sys_available: all(),
asm: [
#[cfg(target_arch = "x86_64")] {
let ret: i32;
asm!(
"xor {ret:e}, {ret:e}",
"ucomiss {a}, {b}",
"setp {ret:l}",
a = in(xmm_reg) a,
b = in(xmm_reg) b,
ret = out(reg) ret,
options(nomem, nostack, pure)
);
ret
};
#[cfg(target_arch = "aarch64")] {
let ret: i32;
asm!(
"fcmp {a:s}, {b:s}",
"cset {ret:w}, vs",
a = in(vreg) a,
b = in(vreg) b,
ret = out(reg) ret,
options(nomem, nostack, pure)
);
ret
};
],
}
float_bench! {
name: cmp_f64_gt,
sig: (a: f64, b: f64) -> i32,
crate_fn: cmp::__gtdf2,
sys_fn: __gtdf2,
sys_available: all(),
output_eq: gt_res_eq,
asm: [
#[cfg(target_arch = "x86_64")] {
let ret: i32;
asm!(
"xor {ret:e}, {ret:e}",
"ucomisd {a}, {b}",
"seta {ret:l}",
a = in(xmm_reg) a,
b = in(xmm_reg) b,
ret = out(reg) ret,
options(nomem, nostack, pure)
);
ret
};
#[cfg(target_arch = "aarch64")] {
let ret: i32;
asm!(
"fcmp {a:d}, {b:d}",
"cset {ret:w}, gt",
a = in(vreg) a,
b = in(vreg) b,
ret = out(reg) ret,
options(nomem, nostack, pure)
);
ret
};
],
}
float_bench! {
name: cmp_f64_unord,
sig: (a: f64, b: f64) -> i32,
crate_fn: cmp::__unorddf2,
sys_fn: __unorddf2,
sys_available: all(),
asm: [
#[cfg(target_arch = "x86_64")] {
let ret: i32;
asm!(
"xor {ret:e}, {ret:e}",
"ucomisd {a}, {b}",
"setp {ret:l}",
a = in(xmm_reg) a,
b = in(xmm_reg) b,
ret = out(reg) ret,
options(nomem, nostack, pure)
);
ret
};
#[cfg(target_arch = "aarch64")] {
let ret: i32;
asm!(
"fcmp {a:d}, {b:d}",
"cset {ret:w}, vs",
a = in(vreg) a,
b = in(vreg) b,
ret = out(reg) ret,
options(nomem, nostack, pure)
);
ret
};
],
}
float_bench! {
name: cmp_f128_gt,
sig: (a: f128, b: f128) -> i32,
crate_fn: cmp::__gttf2,
crate_fn_ppc: cmp::__gtkf2,
sys_fn: __gttf2,
sys_fn_ppc: __gtkf2,
sys_available: not(feature = "no-sys-f128"),
output_eq: gt_res_eq,
asm: []
}
float_bench! {
name: cmp_f128_unord,
sig: (a: f128, b: f128) -> i32,
crate_fn: cmp::__unordtf2,
crate_fn_ppc: cmp::__unordkf2,
sys_fn: __unordtf2,
sys_fn_ppc: __unordkf2,
sys_available: not(feature = "no-sys-f128"),
asm: []
}
// Run all comparison benchmarks; the `f128` variants only when the type is
// enabled for the current target.
pub fn float_cmp() {
    let mut criterion = Criterion::default().configure_from_args();
    cmp_f32_gt(&mut criterion);
    cmp_f32_unord(&mut criterion);
    cmp_f64_gt(&mut criterion);
    cmp_f64_unord(&mut criterion);
    #[cfg(f128_enabled)]
    {
        cmp_f128_gt(&mut criterion);
        cmp_f128_unord(&mut criterion);
    }
}
criterion_main!(float_cmp);

View file

@ -0,0 +1,688 @@
#![allow(improper_ctypes)]
#![cfg_attr(f128_enabled, feature(f128))]
use builtins_test::float_bench;
use compiler_builtins::float::conv;
use criterion::{Criterion, criterion_main};
/* unsigned int -> float */
float_bench! {
name: conv_u32_f32,
sig: (a: u32) -> f32,
crate_fn: conv::__floatunsisf,
sys_fn: __floatunsisf,
sys_available: all(),
asm: [
#[cfg(target_arch = "x86_64")] {
let ret: f32;
asm!(
"mov {tmp:e}, {a:e}",
"cvtsi2ss {ret}, {tmp}",
a = in(reg) a,
tmp = out(reg) _,
ret = lateout(xmm_reg) ret,
options(nomem, nostack, pure),
);
ret
};
#[cfg(target_arch = "aarch64")] {
let ret: f32;
asm!(
"ucvtf {ret:s}, {a:w}",
a = in(reg) a,
ret = lateout(vreg) ret,
options(nomem, nostack, pure),
);
ret
};
],
}
float_bench! {
name: conv_u32_f64,
sig: (a: u32) -> f64,
crate_fn: conv::__floatunsidf,
sys_fn: __floatunsidf,
sys_available: all(),
asm: [
#[cfg(target_arch = "x86_64")] {
let ret: f64;
asm!(
"mov {tmp:e}, {a:e}",
"cvtsi2sd {ret}, {tmp}",
a = in(reg) a,
tmp = out(reg) _,
ret = lateout(xmm_reg) ret,
options(nomem, nostack, pure),
);
ret
};
#[cfg(target_arch = "aarch64")] {
let ret: f64;
asm!(
"ucvtf {ret:d}, {a:w}",
a = in(reg) a,
ret = lateout(vreg) ret,
options(nomem, nostack, pure),
);
ret
};
],
}
#[cfg(f128_enabled)]
float_bench! {
name: conv_u32_f128,
sig: (a: u32) -> f128,
crate_fn: conv::__floatunsitf,
crate_fn_ppc: conv::__floatunsikf,
sys_fn: __floatunsitf,
sys_fn_ppc: __floatunsikf,
sys_available: not(feature = "no-sys-f16-f128-convert"),
asm: []
}
float_bench! {
name: conv_u64_f32,
sig: (a: u64) -> f32,
crate_fn: conv::__floatundisf,
sys_fn: __floatundisf,
sys_available: all(),
asm: [
#[cfg(target_arch = "aarch64")] {
let ret: f32;
asm!(
"ucvtf {ret:s}, {a:x}",
a = in(reg) a,
ret = lateout(vreg) ret,
options(nomem, nostack, pure),
);
ret
};
],
}
float_bench! {
name: conv_u64_f64,
sig: (a: u64) -> f64,
crate_fn: conv::__floatundidf,
sys_fn: __floatundidf,
sys_available: all(),
asm: [
#[cfg(target_arch = "aarch64")] {
let ret: f64;
asm!(
"ucvtf {ret:d}, {a:x}",
a = in(reg) a,
ret = lateout(vreg) ret,
options(nomem, nostack, pure),
);
ret
};
],
}
#[cfg(f128_enabled)]
float_bench! {
name: conv_u64_f128,
sig: (a: u64) -> f128,
crate_fn: conv::__floatunditf,
crate_fn_ppc: conv::__floatundikf,
sys_fn: __floatunditf,
sys_fn_ppc: __floatundikf,
sys_available: not(feature = "no-sys-f16-f128-convert"),
asm: []
}
float_bench! {
name: conv_u128_f32,
sig: (a: u128) -> f32,
crate_fn: conv::__floatuntisf,
sys_fn: __floatuntisf,
sys_available: all(),
asm: []
}
float_bench! {
name: conv_u128_f64,
sig: (a: u128) -> f64,
crate_fn: conv::__floatuntidf,
sys_fn: __floatuntidf,
sys_available: all(),
asm: []
}
#[cfg(f128_enabled)]
float_bench! {
name: conv_u128_f128,
sig: (a: u128) -> f128,
crate_fn: conv::__floatuntitf,
crate_fn_ppc: conv::__floatuntikf,
sys_fn: __floatuntitf,
sys_fn_ppc: __floatuntikf,
sys_available: not(feature = "no-sys-f16-f128-convert"),
asm: []
}
/* signed int -> float */
float_bench! {
name: conv_i32_f32,
sig: (a: i32) -> f32,
crate_fn: conv::__floatsisf,
sys_fn: __floatsisf,
sys_available: all(),
asm: [
#[cfg(target_arch = "x86_64")] {
let ret: f32;
asm!(
"cvtsi2ss {ret}, {a:e}",
a = in(reg) a,
ret = lateout(xmm_reg) ret,
options(nomem, nostack, pure),
);
ret
};
#[cfg(target_arch = "aarch64")] {
let ret: f32;
asm!(
"scvtf {ret:s}, {a:w}",
a = in(reg) a,
ret = lateout(vreg) ret,
options(nomem, nostack, pure),
);
ret
};
],
}
float_bench! {
name: conv_i32_f64,
sig: (a: i32) -> f64,
crate_fn: conv::__floatsidf,
sys_fn: __floatsidf,
sys_available: all(),
asm: [
#[cfg(target_arch = "x86_64")] {
let ret: f64;
asm!(
"cvtsi2sd {ret}, {a:e}",
a = in(reg) a,
ret = lateout(xmm_reg) ret,
options(nomem, nostack, pure),
);
ret
};
#[cfg(target_arch = "aarch64")] {
let ret: f64;
asm!(
"scvtf {ret:d}, {a:w}",
a = in(reg) a,
ret = lateout(vreg) ret,
options(nomem, nostack, pure),
);
ret
};
],
}
#[cfg(f128_enabled)]
float_bench! {
name: conv_i32_f128,
sig: (a: i32) -> f128,
crate_fn: conv::__floatsitf,
crate_fn_ppc: conv::__floatsikf,
sys_fn: __floatsitf,
sys_fn_ppc: __floatsikf,
sys_available: not(feature = "no-sys-f16-f128-convert"),
asm: []
}
float_bench! {
name: conv_i64_f32,
sig: (a: i64) -> f32,
crate_fn: conv::__floatdisf,
sys_fn: __floatdisf,
sys_available: all(),
asm: [
#[cfg(target_arch = "x86_64")] {
let ret: f32;
asm!(
"cvtsi2ss {ret}, {a:r}",
a = in(reg) a,
ret = lateout(xmm_reg) ret,
options(nomem, nostack, pure),
);
ret
};
#[cfg(target_arch = "aarch64")] {
let ret: f32;
asm!(
"scvtf {ret:s}, {a:x}",
a = in(reg) a,
ret = lateout(vreg) ret,
options(nomem, nostack, pure),
);
ret
};
],
}
float_bench! {
name: conv_i64_f64,
sig: (a: i64) -> f64,
crate_fn: conv::__floatdidf,
sys_fn: __floatdidf,
sys_available: all(),
asm: [
#[cfg(target_arch = "x86_64")] {
let ret: f64;
asm!(
"cvtsi2sd {ret}, {a:r}",
a = in(reg) a,
ret = lateout(xmm_reg) ret,
options(nomem, nostack, pure),
);
ret
};
#[cfg(target_arch = "aarch64")] {
let ret: f64;
asm!(
"scvtf {ret:d}, {a:x}",
a = in(reg) a,
ret = lateout(vreg) ret,
options(nomem, nostack, pure),
);
ret
};
],
}
#[cfg(f128_enabled)]
float_bench! {
name: conv_i64_f128,
sig: (a: i64) -> f128,
crate_fn: conv::__floatditf,
crate_fn_ppc: conv::__floatdikf,
sys_fn: __floatditf,
sys_fn_ppc: __floatdikf,
sys_available: not(feature = "no-sys-f16-f128-convert"),
asm: []
}
float_bench! {
name: conv_i128_f32,
sig: (a: i128) -> f32,
crate_fn: conv::__floattisf,
sys_fn: __floattisf,
sys_available: all(),
asm: []
}
float_bench! {
name: conv_i128_f64,
sig: (a: i128) -> f64,
crate_fn: conv::__floattidf,
sys_fn: __floattidf,
sys_available: all(),
asm: []
}
#[cfg(f128_enabled)]
float_bench! {
name: conv_i128_f128,
sig: (a: i128) -> f128,
crate_fn: conv::__floattitf,
crate_fn_ppc: conv::__floattikf,
sys_fn: __floattitf,
sys_fn_ppc: __floattikf,
sys_available: not(feature = "no-sys-f16-f128-convert"),
asm: []
}
/* float -> unsigned int */
#[cfg(not(all(target_arch = "powerpc64", target_endian = "little")))]
float_bench! {
name: conv_f32_u32,
sig: (a: f32) -> u32,
crate_fn: conv::__fixunssfsi,
sys_fn: __fixunssfsi,
sys_available: all(),
asm: [
#[cfg(target_arch = "aarch64")] {
let ret: u32;
asm!(
"fcvtzu {ret:w}, {a:s}",
a = in(vreg) a,
ret = lateout(reg) ret,
options(nomem, nostack, pure),
);
ret
};
],
}
#[cfg(not(all(target_arch = "powerpc64", target_endian = "little")))]
float_bench! {
name: conv_f32_u64,
sig: (a: f32) -> u64,
crate_fn: conv::__fixunssfdi,
sys_fn: __fixunssfdi,
sys_available: all(),
asm: [
#[cfg(target_arch = "aarch64")] {
let ret: u64;
asm!(
"fcvtzu {ret:x}, {a:s}",
a = in(vreg) a,
ret = lateout(reg) ret,
options(nomem, nostack, pure),
);
ret
};
],
}
#[cfg(not(all(target_arch = "powerpc64", target_endian = "little")))]
float_bench! {
name: conv_f32_u128,
sig: (a: f32) -> u128,
crate_fn: conv::__fixunssfti,
sys_fn: __fixunssfti,
sys_available: all(),
asm: []
}
float_bench! {
name: conv_f64_u32,
sig: (a: f64) -> u32,
crate_fn: conv::__fixunsdfsi,
sys_fn: __fixunsdfsi,
sys_available: all(),
asm: [
#[cfg(target_arch = "aarch64")] {
let ret: u32;
asm!(
"fcvtzu {ret:w}, {a:d}",
a = in(vreg) a,
ret = lateout(reg) ret,
options(nomem, nostack, pure),
);
ret
};
],
}
float_bench! {
name: conv_f64_u64,
sig: (a: f64) -> u64,
crate_fn: conv::__fixunsdfdi,
sys_fn: __fixunsdfdi,
sys_available: all(),
asm: [
#[cfg(target_arch = "aarch64")] {
let ret: u64;
asm!(
"fcvtzu {ret:x}, {a:d}",
a = in(vreg) a,
ret = lateout(reg) ret,
options(nomem, nostack, pure),
);
ret
};
],
}
float_bench! {
name: conv_f64_u128,
sig: (a: f64) -> u128,
crate_fn: conv::__fixunsdfti,
sys_fn: __fixunsdfti,
sys_available: all(),
asm: []
}
#[cfg(f128_enabled)]
float_bench! {
name: conv_f128_u32,
sig: (a: f128) -> u32,
crate_fn: conv::__fixunstfsi,
crate_fn_ppc: conv::__fixunskfsi,
sys_fn: __fixunstfsi,
sys_available: not(feature = "no-sys-f16-f128-convert"),
asm: []
}
// Remaining `f128` -> unsigned int conversion benchmarks. The `crate_fn_ppc`
// entries select the `__*kf*` symbol spellings used for `f128` on PowerPC.
#[cfg(f128_enabled)]
float_bench! {
    name: conv_f128_u64,
    sig: (a: f128) -> u64,
    crate_fn: conv::__fixunstfdi,
    crate_fn_ppc: conv::__fixunskfdi,
    sys_fn: __fixunstfdi,
    sys_available: not(feature = "no-sys-f16-f128-convert"),
    asm: []
}

#[cfg(f128_enabled)]
float_bench! {
    name: conv_f128_u128,
    sig: (a: f128) -> u128,
    crate_fn: conv::__fixunstfti,
    crate_fn_ppc: conv::__fixunskfti,
    sys_fn: __fixunstfti,
    sys_available: not(feature = "no-sys-f16-f128-convert"),
    asm: []
}

/* float -> signed int */

// The `f32` -> signed benchmarks are skipped on ppc64le; see the FIXME on the
// matching gate in `float_conv` below.
#[cfg(not(all(target_arch = "powerpc64", target_endian = "little")))]
float_bench! {
    name: conv_f32_i32,
    sig: (a: f32) -> i32,
    crate_fn: conv::__fixsfsi,
    sys_fn: __fixsfsi,
    sys_available: all(),
    asm: [
        #[cfg(target_arch = "aarch64")] {
            let ret: i32;
            asm!(
                "fcvtzs {ret:w}, {a:s}",
                a = in(vreg) a,
                ret = lateout(reg) ret,
                options(nomem, nostack, pure),
            );
            ret
        };
    ],
}

#[cfg(not(all(target_arch = "powerpc64", target_endian = "little")))]
float_bench! {
    name: conv_f32_i64,
    sig: (a: f32) -> i64,
    crate_fn: conv::__fixsfdi,
    sys_fn: __fixsfdi,
    sys_available: all(),
    asm: [
        #[cfg(target_arch = "aarch64")] {
            let ret: i64;
            asm!(
                "fcvtzs {ret:x}, {a:s}",
                a = in(vreg) a,
                ret = lateout(reg) ret,
                options(nomem, nostack, pure),
            );
            ret
        };
    ],
}

#[cfg(not(all(target_arch = "powerpc64", target_endian = "little")))]
float_bench! {
    name: conv_f32_i128,
    sig: (a: f32) -> i128,
    crate_fn: conv::__fixsfti,
    sys_fn: __fixsfti,
    sys_available: all(),
    asm: []
}

float_bench! {
    name: conv_f64_i32,
    sig: (a: f64) -> i32,
    crate_fn: conv::__fixdfsi,
    sys_fn: __fixdfsi,
    sys_available: all(),
    asm: [
        #[cfg(target_arch = "aarch64")] {
            let ret: i32;
            asm!(
                "fcvtzs {ret:w}, {a:d}",
                a = in(vreg) a,
                ret = lateout(reg) ret,
                options(nomem, nostack, pure),
            );
            ret
        };
    ],
}

float_bench! {
    name: conv_f64_i64,
    sig: (a: f64) -> i64,
    crate_fn: conv::__fixdfdi,
    sys_fn: __fixdfdi,
    sys_available: all(),
    asm: [
        #[cfg(target_arch = "aarch64")] {
            let ret: i64;
            asm!(
                "fcvtzs {ret:x}, {a:d}",
                a = in(vreg) a,
                ret = lateout(reg) ret,
                options(nomem, nostack, pure),
            );
            ret
        };
    ],
}

float_bench! {
    name: conv_f64_i128,
    sig: (a: f64) -> i128,
    crate_fn: conv::__fixdfti,
    sys_fn: __fixdfti,
    sys_available: all(),
    asm: []
}

#[cfg(f128_enabled)]
float_bench! {
    name: conv_f128_i32,
    sig: (a: f128) -> i32,
    crate_fn: conv::__fixtfsi,
    crate_fn_ppc: conv::__fixkfsi,
    sys_fn: __fixtfsi,
    sys_available: not(feature = "no-sys-f16-f128-convert"),
    asm: []
}

#[cfg(f128_enabled)]
float_bench! {
    name: conv_f128_i64,
    sig: (a: f128) -> i64,
    crate_fn: conv::__fixtfdi,
    crate_fn_ppc: conv::__fixkfdi,
    sys_fn: __fixtfdi,
    sys_available: not(feature = "no-sys-f16-f128-convert"),
    asm: []
}

#[cfg(f128_enabled)]
float_bench! {
    name: conv_f128_i128,
    sig: (a: f128) -> i128,
    crate_fn: conv::__fixtfti,
    crate_fn_ppc: conv::__fixkfti,
    sys_fn: __fixtfti,
    sys_available: not(feature = "no-sys-f16-f128-convert"),
    asm: []
}
pub fn float_conv() {
let mut criterion = Criterion::default().configure_from_args();
conv_u32_f32(&mut criterion);
conv_u32_f64(&mut criterion);
conv_u64_f32(&mut criterion);
conv_u64_f64(&mut criterion);
conv_u128_f32(&mut criterion);
conv_u128_f64(&mut criterion);
conv_i32_f32(&mut criterion);
conv_i32_f64(&mut criterion);
conv_i64_f32(&mut criterion);
conv_i64_f64(&mut criterion);
conv_i128_f32(&mut criterion);
conv_i128_f64(&mut criterion);
conv_f64_u32(&mut criterion);
conv_f64_u64(&mut criterion);
conv_f64_u128(&mut criterion);
conv_f64_i32(&mut criterion);
conv_f64_i64(&mut criterion);
conv_f64_i128(&mut criterion);
#[cfg(f128_enabled)]
// FIXME: ppc64le has a sporadic overflow panic in the crate functions
// <https://github.com/rust-lang/compiler-builtins/issues/617#issuecomment-2125914639>
#[cfg(not(all(target_arch = "powerpc64", target_endian = "little")))]
{
conv_u32_f128(&mut criterion);
conv_u64_f128(&mut criterion);
conv_u128_f128(&mut criterion);
conv_i32_f128(&mut criterion);
conv_i64_f128(&mut criterion);
conv_i128_f128(&mut criterion);
conv_f128_u32(&mut criterion);
conv_f128_u64(&mut criterion);
conv_f128_u128(&mut criterion);
conv_f128_i32(&mut criterion);
conv_f128_i64(&mut criterion);
conv_f128_i128(&mut criterion);
}
}
criterion_main!(float_conv);

View file

@ -0,0 +1,93 @@
#![cfg_attr(f128_enabled, feature(f128))]
use builtins_test::float_bench;
use compiler_builtins::float::div;
use criterion::{Criterion, criterion_main};
// Float division benchmarks: crate implementation vs. system builtin vs. a
// single hardware instruction where one exists.
float_bench! {
    name: div_f32,
    sig: (a: f32, b: f32) -> f32,
    crate_fn: div::__divsf3,
    sys_fn: __divsf3,
    sys_available: all(),
    asm: [
        #[cfg(target_arch = "x86_64")] {
            asm!(
                "divss {a}, {b}",
                a = inout(xmm_reg) a,
                b = in(xmm_reg) b,
                options(nomem, nostack, pure)
            );
            a
        };

        #[cfg(target_arch = "aarch64")] {
            asm!(
                "fdiv {a:s}, {a:s}, {b:s}",
                a = inout(vreg) a,
                b = in(vreg) b,
                options(nomem, nostack, pure)
            );
            a
        };
    ],
}

float_bench! {
    name: div_f64,
    sig: (a: f64, b: f64) -> f64,
    crate_fn: div::__divdf3,
    sys_fn: __divdf3,
    sys_available: all(),
    asm: [
        #[cfg(target_arch = "x86_64")] {
            asm!(
                "divsd {a}, {b}",
                a = inout(xmm_reg) a,
                b = in(xmm_reg) b,
                options(nomem, nostack, pure)
            );
            a
        };

        #[cfg(target_arch = "aarch64")] {
            asm!(
                "fdiv {a:d}, {a:d}, {b:d}",
                a = inout(vreg) a,
                b = in(vreg) b,
                options(nomem, nostack, pure)
            );
            a
        };
    ],
}

// No common hardware instruction for `f128` division, so no `asm` entry.
#[cfg(f128_enabled)]
float_bench! {
    name: div_f128,
    sig: (a: f128, b: f128) -> f128,
    crate_fn: div::__divtf3,
    crate_fn_ppc: div::__divkf3,
    sys_fn: __divtf3,
    sys_fn_ppc: __divkf3,
    sys_available: not(feature = "no-sys-f128"),
    asm: []
}
/// Entry point: run every division benchmark registered above.
pub fn float_div() {
    let mut c = Criterion::default().configure_from_args();
    // Always-available widths.
    div_f32(&mut c);
    div_f64(&mut c);
    // `f128` support depends on the toolchain/target.
    #[cfg(f128_enabled)]
    div_f128(&mut c);
}

criterion_main!(float_div);

View file

@ -0,0 +1,133 @@
#![allow(unused_variables)] // "unused" f16 registers
#![cfg_attr(f128_enabled, feature(f128))]
#![cfg_attr(f16_enabled, feature(f16))]
use builtins_test::float_bench;
use compiler_builtins::float::extend;
use criterion::{Criterion, criterion_main};
// Float widening ("extend") benchmarks. `f16` sources use aarch64 `fcvt`
// where available; `f128` destinations have no single-instruction form.
#[cfg(f16_enabled)]
float_bench! {
    name: extend_f16_f32,
    sig: (a: f16) -> f32,
    crate_fn: extend::__extendhfsf2,
    sys_fn: __extendhfsf2,
    sys_available: not(feature = "no-sys-f16"),
    asm: [
        #[cfg(target_arch = "aarch64")] {
            let ret: f32;
            asm!(
                "fcvt {ret:s}, {a:h}",
                a = in(vreg) a,
                ret = lateout(vreg) ret,
                options(nomem, nostack, pure),
            );
            ret
        };
    ],
}

#[cfg(f16_enabled)]
float_bench! {
    name: extend_f16_f64,
    sig: (a: f16) -> f64,
    crate_fn: extend::__extendhfdf2,
    sys_fn: __extendhfdf2,
    sys_available: not(feature = "no-sys-f16-f64-convert"),
    asm: [
        #[cfg(target_arch = "aarch64")] {
            let ret: f64;
            asm!(
                "fcvt {ret:d}, {a:h}",
                a = in(vreg) a,
                ret = lateout(vreg) ret,
                options(nomem, nostack, pure),
            );
            ret
        };
    ],
}

#[cfg(all(f16_enabled, f128_enabled))]
float_bench! {
    name: extend_f16_f128,
    sig: (a: f16) -> f128,
    crate_fn: extend::__extendhftf2,
    crate_fn_ppc: extend::__extendhfkf2,
    sys_fn: __extendhftf2,
    sys_fn_ppc: __extendhfkf2,
    sys_available: not(feature = "no-sys-f16-f128-convert"),
    asm: [],
}

float_bench! {
    name: extend_f32_f64,
    sig: (a: f32) -> f64,
    crate_fn: extend::__extendsfdf2,
    sys_fn: __extendsfdf2,
    sys_available: all(),
    asm: [
        #[cfg(target_arch = "aarch64")] {
            let ret: f64;
            asm!(
                "fcvt {ret:d}, {a:s}",
                a = in(vreg) a,
                ret = lateout(vreg) ret,
                options(nomem, nostack, pure),
            );
            ret
        };
    ],
}

#[cfg(f128_enabled)]
float_bench! {
    name: extend_f32_f128,
    sig: (a: f32) -> f128,
    crate_fn: extend::__extendsftf2,
    crate_fn_ppc: extend::__extendsfkf2,
    sys_fn: __extendsftf2,
    sys_fn_ppc: __extendsfkf2,
    sys_available: not(feature = "no-sys-f128"),
    asm: [],
}

#[cfg(f128_enabled)]
float_bench! {
    name: extend_f64_f128,
    sig: (a: f64) -> f128,
    crate_fn: extend::__extenddftf2,
    crate_fn_ppc: extend::__extenddfkf2,
    sys_fn: __extenddftf2,
    sys_fn_ppc: __extenddfkf2,
    sys_available: not(feature = "no-sys-f128"),
    asm: [],
}
/// Entry point: run every widening benchmark registered above.
pub fn float_extend() {
    let mut c = Criterion::default().configure_from_args();

    // FIXME(#655): `f16` tests disabled until we can bootstrap symbols
    #[cfg(f16_enabled)]
    #[cfg(not(any(target_arch = "powerpc", target_arch = "powerpc64")))]
    {
        extend_f16_f32(&mut c);
        extend_f16_f64(&mut c);
        #[cfg(f128_enabled)]
        extend_f16_f128(&mut c);
    }

    extend_f32_f64(&mut c);

    // `f128` destinations depend on toolchain support.
    #[cfg(f128_enabled)]
    {
        extend_f32_f128(&mut c);
        extend_f64_f128(&mut c);
    }
}

criterion_main!(float_extend);

View file

@ -0,0 +1,93 @@
#![cfg_attr(f128_enabled, feature(f128))]
use builtins_test::float_bench;
use compiler_builtins::float::mul;
use criterion::{Criterion, criterion_main};
// Float multiplication benchmarks: crate implementation vs. system builtin
// vs. a single hardware instruction where one exists.
float_bench! {
    name: mul_f32,
    sig: (a: f32, b: f32) -> f32,
    crate_fn: mul::__mulsf3,
    sys_fn: __mulsf3,
    sys_available: all(),
    asm: [
        #[cfg(target_arch = "x86_64")] {
            asm!(
                "mulss {a}, {b}",
                a = inout(xmm_reg) a,
                b = in(xmm_reg) b,
                options(nomem, nostack, pure)
            );
            a
        };

        #[cfg(target_arch = "aarch64")] {
            asm!(
                "fmul {a:s}, {a:s}, {b:s}",
                a = inout(vreg) a,
                b = in(vreg) b,
                options(nomem, nostack, pure)
            );
            a
        };
    ],
}

float_bench! {
    name: mul_f64,
    sig: (a: f64, b: f64) -> f64,
    crate_fn: mul::__muldf3,
    sys_fn: __muldf3,
    sys_available: all(),
    asm: [
        #[cfg(target_arch = "x86_64")] {
            asm!(
                "mulsd {a}, {b}",
                a = inout(xmm_reg) a,
                b = in(xmm_reg) b,
                options(nomem, nostack, pure)
            );
            a
        };

        #[cfg(target_arch = "aarch64")] {
            asm!(
                "fmul {a:d}, {a:d}, {b:d}",
                a = inout(vreg) a,
                b = in(vreg) b,
                options(nomem, nostack, pure)
            );
            a
        };
    ],
}

// No common hardware instruction for `f128` multiplication.
#[cfg(f128_enabled)]
float_bench! {
    name: mul_f128,
    sig: (a: f128, b: f128) -> f128,
    crate_fn: mul::__multf3,
    crate_fn_ppc: mul::__mulkf3,
    sys_fn: __multf3,
    sys_fn_ppc: __mulkf3,
    sys_available: not(feature = "no-sys-f128"),
    asm: []
}
/// Entry point: run every multiplication benchmark registered above.
pub fn float_mul() {
    let mut c = Criterion::default().configure_from_args();
    // Always-available widths.
    mul_f32(&mut c);
    mul_f64(&mut c);
    // `f128` support depends on the toolchain/target.
    #[cfg(f128_enabled)]
    mul_f128(&mut c);
}

criterion_main!(float_mul);

View file

@ -0,0 +1,49 @@
#![cfg_attr(f128_enabled, feature(f128))]
use builtins_test::float_bench;
use compiler_builtins::float::pow;
use criterion::{Criterion, criterion_main};
// `powi` (float raised to an integer power) benchmarks. There is no hardware
// instruction for these, so no `asm` entries.
float_bench! {
    name: powi_f32,
    sig: (a: f32, b: i32) -> f32,
    crate_fn: pow::__powisf2,
    sys_fn: __powisf2,
    sys_available: all(),
    asm: [],
}

float_bench! {
    name: powi_f64,
    sig: (a: f64, b: i32) -> f64,
    crate_fn: pow::__powidf2,
    sys_fn: __powidf2,
    sys_available: all(),
    asm: [],
}

// FIXME(f16_f128): can be changed to only `f128_enabled` once `__multf3` and `__divtf3` are
// distributed by nightly.
#[cfg(all(f128_enabled, not(feature = "no-sys-f128")))]
float_bench! {
    name: powi_f128,
    sig: (a: f128, b: i32) -> f128,
    crate_fn: pow::__powitf2,
    crate_fn_ppc: pow::__powikf2,
    sys_fn: __powitf2,
    sys_fn_ppc: __powikf2,
    sys_available: not(feature = "no-sys-f128"),
    asm: []
}
/// Entry point: run every `powi` benchmark registered above.
pub fn float_pow() {
    let mut c = Criterion::default().configure_from_args();
    powi_f32(&mut c);
    powi_f64(&mut c);
    // Gate matches the benchmark's own definition (needs system `f128` math).
    #[cfg(all(f128_enabled, not(feature = "no-sys-f128")))]
    powi_f128(&mut c);
}

criterion_main!(float_pow);

View file

@ -0,0 +1,93 @@
#![cfg_attr(f128_enabled, feature(f128))]
use builtins_test::float_bench;
use compiler_builtins::float::sub;
use criterion::{Criterion, criterion_main};
// Float subtraction benchmarks: crate implementation vs. system builtin vs.
// a single hardware instruction where one exists.
float_bench! {
    name: sub_f32,
    sig: (a: f32, b: f32) -> f32,
    crate_fn: sub::__subsf3,
    sys_fn: __subsf3,
    sys_available: all(),
    asm: [
        #[cfg(target_arch = "x86_64")] {
            asm!(
                "subss {a}, {b}",
                a = inout(xmm_reg) a,
                b = in(xmm_reg) b,
                options(nomem, nostack, pure)
            );
            a
        };

        #[cfg(target_arch = "aarch64")] {
            asm!(
                "fsub {a:s}, {a:s}, {b:s}",
                a = inout(vreg) a,
                b = in(vreg) b,
                options(nomem, nostack, pure)
            );
            a
        };
    ],
}

float_bench! {
    name: sub_f64,
    sig: (a: f64, b: f64) -> f64,
    crate_fn: sub::__subdf3,
    sys_fn: __subdf3,
    sys_available: all(),
    asm: [
        #[cfg(target_arch = "x86_64")] {
            asm!(
                "subsd {a}, {b}",
                a = inout(xmm_reg) a,
                b = in(xmm_reg) b,
                options(nomem, nostack, pure)
            );
            a
        };

        #[cfg(target_arch = "aarch64")] {
            asm!(
                "fsub {a:d}, {a:d}, {b:d}",
                a = inout(vreg) a,
                b = in(vreg) b,
                options(nomem, nostack, pure)
            );
            a
        };
    ],
}

// No common hardware instruction for `f128` subtraction.
#[cfg(f128_enabled)]
float_bench! {
    name: sub_f128,
    sig: (a: f128, b: f128) -> f128,
    crate_fn: sub::__subtf3,
    crate_fn_ppc: sub::__subkf3,
    sys_fn: __subtf3,
    sys_fn_ppc: __subkf3,
    sys_available: not(feature = "no-sys-f128"),
    asm: []
}
/// Entry point: run every subtraction benchmark registered above.
pub fn float_sub() {
    let mut c = Criterion::default().configure_from_args();
    // Always-available widths.
    sub_f32(&mut c);
    sub_f64(&mut c);
    // `f128` support depends on the toolchain/target.
    #[cfg(f128_enabled)]
    sub_f128(&mut c);
}

criterion_main!(float_sub);

View file

@ -0,0 +1,146 @@
#![cfg_attr(f128_enabled, feature(f128))]
#![cfg_attr(f16_enabled, feature(f16))]
use builtins_test::float_bench;
use compiler_builtins::float::trunc;
use criterion::{Criterion, criterion_main};
// Float narrowing ("trunc") benchmarks. `f16` destinations use aarch64 `fcvt`
// where available; `f128` sources have no single-instruction form.
#[cfg(f16_enabled)]
float_bench! {
    name: trunc_f32_f16,
    sig: (a: f32) -> f16,
    crate_fn: trunc::__truncsfhf2,
    sys_fn: __truncsfhf2,
    sys_available: not(feature = "no-sys-f16"),
    asm: [
        #[cfg(target_arch = "aarch64")] {
            let ret: f16;
            asm!(
                "fcvt {ret:h}, {a:s}",
                a = in(vreg) a,
                ret = lateout(vreg) ret,
                options(nomem, nostack, pure),
            );
            ret
        };
    ],
}

#[cfg(f16_enabled)]
float_bench! {
    name: trunc_f64_f16,
    sig: (a: f64) -> f16,
    crate_fn: trunc::__truncdfhf2,
    sys_fn: __truncdfhf2,
    sys_available: not(feature = "no-sys-f16-f64-convert"),
    asm: [
        #[cfg(target_arch = "aarch64")] {
            let ret: f16;
            asm!(
                "fcvt {ret:h}, {a:d}",
                a = in(vreg) a,
                ret = lateout(vreg) ret,
                options(nomem, nostack, pure),
            );
            ret
        };
    ],
}

float_bench! {
    name: trunc_f64_f32,
    sig: (a: f64) -> f32,
    crate_fn: trunc::__truncdfsf2,
    sys_fn: __truncdfsf2,
    sys_available: all(),
    asm: [
        #[cfg(target_arch = "x86_64")] {
            let ret: f32;
            asm!(
                "cvtsd2ss {ret}, {a}",
                a = in(xmm_reg) a,
                ret = lateout(xmm_reg) ret,
                options(nomem, nostack, pure),
            );
            ret
        };

        #[cfg(target_arch = "aarch64")] {
            let ret: f32;
            asm!(
                "fcvt {ret:s}, {a:d}",
                a = in(vreg) a,
                ret = lateout(vreg) ret,
                options(nomem, nostack, pure),
            );
            ret
        };
    ],
}

#[cfg(all(f16_enabled, f128_enabled))]
float_bench! {
    name: trunc_f128_f16,
    sig: (a: f128) -> f16,
    crate_fn: trunc::__trunctfhf2,
    crate_fn_ppc: trunc::__trunckfhf2,
    sys_fn: __trunctfhf2,
    sys_fn_ppc: __trunckfhf2,
    sys_available: not(feature = "no-sys-f16-f128-convert"),
    asm: [],
}

#[cfg(f128_enabled)]
float_bench! {
    name: trunc_f128_f32,
    sig: (a: f128) -> f32,
    crate_fn: trunc::__trunctfsf2,
    crate_fn_ppc: trunc::__trunckfsf2,
    sys_fn: __trunctfsf2,
    sys_fn_ppc: __trunckfsf2,
    sys_available: not(feature = "no-sys-f128"),
    asm: [],
}

#[cfg(f128_enabled)]
float_bench! {
    name: trunc_f128_f64,
    sig: (a: f128) -> f64,
    crate_fn: trunc::__trunctfdf2,
    crate_fn_ppc: trunc::__trunckfdf2,
    sys_fn: __trunctfdf2,
    sys_fn_ppc: __trunckfdf2,
    sys_available: not(feature = "no-sys-f128"),
    asm: [],
}
/// Entry point: run every narrowing benchmark registered above.
pub fn float_trunc() {
    let mut criterion = Criterion::default().configure_from_args();

    // FIXME(#655): `f16` tests disabled until we can bootstrap symbols
    #[cfg(f16_enabled)]
    #[cfg(not(any(target_arch = "powerpc", target_arch = "powerpc64")))]
    {
        trunc_f32_f16(&mut criterion);
        trunc_f64_f16(&mut criterion);
    }

    trunc_f64_f32(&mut criterion);

    // `f128` sources depend on toolchain support.
    #[cfg(f128_enabled)]
    {
        // FIXME(#655): `f16` tests disabled until we can bootstrap symbols
        #[cfg(f16_enabled)]
        #[cfg(not(any(target_arch = "powerpc", target_arch = "powerpc64")))]
        trunc_f128_f16(&mut criterion);

        trunc_f128_f32(&mut criterion);
        trunc_f128_f64(&mut criterion);
    }
}

criterion_main!(float_trunc);

View file

@ -0,0 +1,364 @@
#![feature(test)]
extern crate test;
use test::{Bencher, black_box};
extern crate compiler_builtins;
use compiler_builtins::mem::{memcmp, memcpy, memmove, memset};
const WORD_SIZE: usize = core::mem::size_of::<usize>();

/// A byte buffer backed by a `Vec<usize>`, so its start is always aligned to
/// at least `WORD_SIZE`.
struct AlignedVec {
    vec: Vec<usize>,
    // Length in *bytes* exposed through `Deref`; the backing allocation is
    // rounded up to a whole number of words.
    size: usize,
}

impl AlignedVec {
    /// Create a buffer of `size` bytes, every byte initialized to `fill`.
    fn new(fill: u8, size: usize) -> Self {
        // Broadcast the fill byte into every byte of a `usize` word.
        let mut broadcast = fill as usize;
        let mut bits = 8;
        while bits < WORD_SIZE * 8 {
            broadcast |= broadcast << bits;
            bits *= 2;
        }
        // Allocate ceil(size / WORD_SIZE) words so that `size` bytes are
        // always backed by the allocation. The previous expression,
        // `(size + WORD_SIZE - 1) & !WORD_SIZE`, only cleared a single bit:
        // it over-allocated by roughly a factor of WORD_SIZE in general and
        // allocated *zero* words for `size == 1`, which would make `deref`
        // produce a slice over unallocated memory.
        let words = (size + WORD_SIZE - 1) / WORD_SIZE;
        let vec = vec![broadcast; words];
        AlignedVec { vec, size }
    }
}

impl core::ops::Deref for AlignedVec {
    type Target = [u8];
    // View the word-backed storage as exactly `size` bytes.
    fn deref(&self) -> &[u8] {
        unsafe { core::slice::from_raw_parts(self.vec.as_ptr() as *const u8, self.size) }
    }
}

impl core::ops::DerefMut for AlignedVec {
    fn deref_mut(&mut self) -> &mut [u8] {
        unsafe { core::slice::from_raw_parts_mut(self.vec.as_mut_ptr() as *mut u8, self.size) }
    }
}
/// Benchmark the standard-library copy (`copy_from_slice`) of `n` bytes, with
/// `src`/`dst` placed `offset1`/`offset2` bytes past word alignment.
fn memcpy_builtin(b: &mut Bencher, n: usize, offset1: usize, offset2: usize) {
    let v1 = AlignedVec::new(1, n + offset1);
    let mut v2 = AlignedVec::new(0, n + offset2);
    b.bytes = n as u64;
    b.iter(|| {
        let src: &[u8] = black_box(&v1[offset1..]);
        let dst: &mut [u8] = black_box(&mut v2[offset2..]);
        dst.copy_from_slice(src);
    })
}

/// Benchmark this crate's `memcpy` with the same layout as `memcpy_builtin`.
fn memcpy_rust(b: &mut Bencher, n: usize, offset1: usize, offset2: usize) {
    let v1 = AlignedVec::new(1, n + offset1);
    let mut v2 = AlignedVec::new(0, n + offset2);
    b.bytes = n as u64;
    b.iter(|| {
        let src: &[u8] = black_box(&v1[offset1..]);
        let dst: &mut [u8] = black_box(&mut v2[offset2..]);
        unsafe { memcpy(dst.as_mut_ptr(), src.as_ptr(), n) }
    })
}
/// Benchmark a plain byte-store loop over `n` bytes, `offset` past word
/// alignment (the "builtin" baseline).
fn memset_builtin(b: &mut Bencher, n: usize, offset: usize) {
    let mut buf = AlignedVec::new(0, n + offset);
    b.bytes = n as u64;
    b.iter(|| {
        let dst: &mut [u8] = black_box(&mut buf[offset..]);
        let val: u8 = black_box(27);
        for byte in dst.iter_mut() {
            *byte = val;
        }
    })
}

/// Benchmark this crate's `memset` with the same layout as `memset_builtin`.
fn memset_rust(b: &mut Bencher, n: usize, offset: usize) {
    let mut buf = AlignedVec::new(0, n + offset);
    b.bytes = n as u64;
    b.iter(|| {
        let dst: &mut [u8] = black_box(&mut buf[offset..]);
        let val = black_box(27);
        unsafe { memset(dst.as_mut_ptr(), val, n) }
    })
}
/// Benchmark slice comparison of two aligned `n`-byte buffers that differ
/// only in the final byte (worst case: the whole buffer is scanned).
fn memcmp_builtin(b: &mut Bencher, n: usize) {
    let v1 = AlignedVec::new(0, n);
    let mut v2 = AlignedVec::new(0, n);
    v2[n - 1] = 1;
    b.bytes = n as u64;
    b.iter(|| {
        let s1: &[u8] = black_box(&v1);
        let s2: &[u8] = black_box(&v2);
        s1.cmp(s2)
    })
}

/// Same as `memcmp_builtin` but with `s2` shifted one byte, so the pointers
/// have different alignments.
// NOTE(review): here `s1` has `n` bytes and `s2` has `n - 1`; `Ord::cmp`
// handles the length mismatch lexicographically.
fn memcmp_builtin_unaligned(b: &mut Bencher, n: usize) {
    let v1 = AlignedVec::new(0, n);
    let mut v2 = AlignedVec::new(0, n);
    v2[n - 1] = 1;
    b.bytes = n as u64;
    b.iter(|| {
        let s1: &[u8] = black_box(&v1[0..]);
        let s2: &[u8] = black_box(&v2[1..]);
        s1.cmp(s2)
    })
}

/// Benchmark this crate's `memcmp` with the same layout as `memcmp_builtin`.
fn memcmp_rust(b: &mut Bencher, n: usize) {
    let v1 = AlignedVec::new(0, n);
    let mut v2 = AlignedVec::new(0, n);
    v2[n - 1] = 1;
    b.bytes = n as u64;
    b.iter(|| {
        let s1: &[u8] = black_box(&v1);
        let s2: &[u8] = black_box(&v2);
        unsafe { memcmp(s1.as_ptr(), s2.as_ptr(), n) }
    })
}

/// Benchmark this crate's `memcmp` on misaligned pointers; compares `n - 1`
/// bytes so the shifted buffer stays in bounds.
fn memcmp_rust_unaligned(b: &mut Bencher, n: usize) {
    let v1 = AlignedVec::new(0, n);
    let mut v2 = AlignedVec::new(0, n);
    v2[n - 1] = 1;
    b.bytes = n as u64;
    b.iter(|| {
        let s1: &[u8] = black_box(&v1[0..]);
        let s2: &[u8] = black_box(&v2[1..]);
        unsafe { memcmp(s1.as_ptr(), s2.as_ptr(), n - 1) }
    })
}
/// Benchmark the standard-library overlapping copy (`copy_within`) of `n`
/// bytes shifted forward by `n / 2 + offset`.
fn memmove_builtin(b: &mut Bencher, n: usize, offset: usize) {
    let mut v = AlignedVec::new(0, n + n / 2 + offset);
    b.bytes = n as u64;
    b.iter(|| {
        let s: &mut [u8] = black_box(&mut v);
        s.copy_within(0..n, n / 2 + offset);
    })
}

/// Benchmark this crate's `memmove` with the same overlapping layout as
/// `memmove_builtin`.
fn memmove_rust(b: &mut Bencher, n: usize, offset: usize) {
    let mut v = AlignedVec::new(0, n + n / 2 + offset);
    b.bytes = n as u64;
    b.iter(|| {
        let dst: *mut u8 = black_box(&mut v[n / 2 + offset..]).as_mut_ptr();
        let src: *const u8 = black_box(&v).as_ptr();
        unsafe { memmove(dst, src, n) };
    })
}
/* `memcpy`: aligned, equally-offset, and differently-offset (misaligned) */

#[bench]
fn memcpy_builtin_4096(b: &mut Bencher) {
    memcpy_builtin(b, 4096, 0, 0)
}
#[bench]
fn memcpy_rust_4096(b: &mut Bencher) {
    memcpy_rust(b, 4096, 0, 0)
}
#[bench]
fn memcpy_builtin_1048576(b: &mut Bencher) {
    memcpy_builtin(b, 1048576, 0, 0)
}
#[bench]
fn memcpy_rust_1048576(b: &mut Bencher) {
    memcpy_rust(b, 1048576, 0, 0)
}
#[bench]
fn memcpy_builtin_4096_offset(b: &mut Bencher) {
    memcpy_builtin(b, 4096, 65, 65)
}
#[bench]
fn memcpy_rust_4096_offset(b: &mut Bencher) {
    memcpy_rust(b, 4096, 65, 65)
}
#[bench]
fn memcpy_builtin_1048576_offset(b: &mut Bencher) {
    memcpy_builtin(b, 1048576, 65, 65)
}
#[bench]
fn memcpy_rust_1048576_offset(b: &mut Bencher) {
    memcpy_rust(b, 1048576, 65, 65)
}
#[bench]
fn memcpy_builtin_4096_misalign(b: &mut Bencher) {
    memcpy_builtin(b, 4096, 65, 66)
}
#[bench]
fn memcpy_rust_4096_misalign(b: &mut Bencher) {
    memcpy_rust(b, 4096, 65, 66)
}
#[bench]
fn memcpy_builtin_1048576_misalign(b: &mut Bencher) {
    memcpy_builtin(b, 1048576, 65, 66)
}
#[bench]
fn memcpy_rust_1048576_misalign(b: &mut Bencher) {
    memcpy_rust(b, 1048576, 65, 66)
}

/* `memset`: aligned and offset */

#[bench]
fn memset_builtin_4096(b: &mut Bencher) {
    memset_builtin(b, 4096, 0)
}
#[bench]
fn memset_rust_4096(b: &mut Bencher) {
    memset_rust(b, 4096, 0)
}
#[bench]
fn memset_builtin_1048576(b: &mut Bencher) {
    memset_builtin(b, 1048576, 0)
}
#[bench]
fn memset_rust_1048576(b: &mut Bencher) {
    memset_rust(b, 1048576, 0)
}
#[bench]
fn memset_builtin_4096_offset(b: &mut Bencher) {
    memset_builtin(b, 4096, 65)
}
#[bench]
fn memset_rust_4096_offset(b: &mut Bencher) {
    memset_rust(b, 4096, 65)
}
#[bench]
fn memset_builtin_1048576_offset(b: &mut Bencher) {
    memset_builtin(b, 1048576, 65)
}
#[bench]
fn memset_rust_1048576_offset(b: &mut Bencher) {
    memset_rust(b, 1048576, 65)
}

/* `memcmp`: aligned, then unaligned (names give the compared length) */

#[bench]
fn memcmp_builtin_8(b: &mut Bencher) {
    memcmp_builtin(b, 8)
}
#[bench]
fn memcmp_rust_8(b: &mut Bencher) {
    memcmp_rust(b, 8)
}
#[bench]
fn memcmp_builtin_16(b: &mut Bencher) {
    memcmp_builtin(b, 16)
}
#[bench]
fn memcmp_rust_16(b: &mut Bencher) {
    memcmp_rust(b, 16)
}
#[bench]
fn memcmp_builtin_32(b: &mut Bencher) {
    memcmp_builtin(b, 32)
}
#[bench]
fn memcmp_rust_32(b: &mut Bencher) {
    memcmp_rust(b, 32)
}
#[bench]
fn memcmp_builtin_64(b: &mut Bencher) {
    memcmp_builtin(b, 64)
}
#[bench]
fn memcmp_rust_64(b: &mut Bencher) {
    memcmp_rust(b, 64)
}
#[bench]
fn memcmp_builtin_4096(b: &mut Bencher) {
    memcmp_builtin(b, 4096)
}
#[bench]
fn memcmp_rust_4096(b: &mut Bencher) {
    memcmp_rust(b, 4096)
}
#[bench]
fn memcmp_builtin_1048576(b: &mut Bencher) {
    memcmp_builtin(b, 1048576)
}
#[bench]
fn memcmp_rust_1048576(b: &mut Bencher) {
    memcmp_rust(b, 1048576)
}
#[bench]
fn memcmp_builtin_unaligned_7(b: &mut Bencher) {
    memcmp_builtin_unaligned(b, 8)
}
#[bench]
fn memcmp_rust_unaligned_7(b: &mut Bencher) {
    memcmp_rust_unaligned(b, 8)
}
#[bench]
fn memcmp_builtin_unaligned_15(b: &mut Bencher) {
    memcmp_builtin_unaligned(b, 16)
}
#[bench]
fn memcmp_rust_unaligned_15(b: &mut Bencher) {
    memcmp_rust_unaligned(b, 16)
}
#[bench]
fn memcmp_builtin_unaligned_31(b: &mut Bencher) {
    memcmp_builtin_unaligned(b, 32)
}
#[bench]
fn memcmp_rust_unaligned_31(b: &mut Bencher) {
    memcmp_rust_unaligned(b, 32)
}
#[bench]
fn memcmp_builtin_unaligned_63(b: &mut Bencher) {
    memcmp_builtin_unaligned(b, 64)
}
#[bench]
fn memcmp_rust_unaligned_63(b: &mut Bencher) {
    memcmp_rust_unaligned(b, 64)
}
#[bench]
fn memcmp_builtin_unaligned_4095(b: &mut Bencher) {
    memcmp_builtin_unaligned(b, 4096)
}
#[bench]
fn memcmp_rust_unaligned_4095(b: &mut Bencher) {
    memcmp_rust_unaligned(b, 4096)
}
#[bench]
fn memcmp_builtin_unaligned_1048575(b: &mut Bencher) {
    memcmp_builtin_unaligned(b, 1048576)
}
#[bench]
fn memcmp_rust_unaligned_1048575(b: &mut Bencher) {
    memcmp_rust_unaligned(b, 1048576)
}

/* `memmove`: aligned and misaligned overlapping copies */

#[bench]
fn memmove_builtin_4096(b: &mut Bencher) {
    memmove_builtin(b, 4096, 0)
}
#[bench]
fn memmove_rust_4096(b: &mut Bencher) {
    memmove_rust(b, 4096, 0)
}
#[bench]
fn memmove_builtin_1048576(b: &mut Bencher) {
    memmove_builtin(b, 1048576, 0)
}
#[bench]
fn memmove_rust_1048576(b: &mut Bencher) {
    memmove_rust(b, 1048576, 0)
}
#[bench]
fn memmove_builtin_4096_misalign(b: &mut Bencher) {
    memmove_builtin(b, 4096, 1)
}
#[bench]
fn memmove_rust_4096_misalign(b: &mut Bencher) {
    memmove_rust(b, 4096, 1)
}
#[bench]
fn memmove_builtin_1048576_misalign(b: &mut Bencher) {
    memmove_builtin(b, 1048576, 1)
}
#[bench]
fn memmove_rust_1048576_misalign(b: &mut Bencher) {
    memmove_rust(b, 1048576, 1)
}

View file

@ -0,0 +1,500 @@
//! Benchmarks that use Callgrind (via `iai_callgrind`) to report instruction count metrics. This
//! is stable enough to be tested in CI.
use std::hint::black_box;
use std::{ops, slice};
use compiler_builtins::mem::{memcmp, memcpy, memmove, memset};
use iai_callgrind::{library_benchmark, library_benchmark_group, main};
const PAGE_SIZE: usize = 0x1000; // 4 kiB
const MAX_ALIGN: usize = 512; // assume we may use avx512 operations one day
const MEG1: usize = 1 << 20; // 1 MiB

// A page-sized, page-aligned unit of storage; used so boxed buffers start on
// a page boundary.
#[derive(Clone)]
#[repr(C, align(0x1000))]
struct Page([u8; PAGE_SIZE]);

/// A buffer that is page-aligned by default, with an optional offset to create a
/// misalignment.
struct AlignedSlice {
    buf: Box<[Page]>,
    // Length in bytes exposed through `Deref`.
    len: usize,
    // Byte offset from the page boundary at which the slice starts.
    offset: usize,
}
impl AlignedSlice {
    /// Allocate a page-aligned, zeroed buffer with at least `len` bytes,
    /// starting `offset` bytes past page alignment.
    fn new_zeroed(len: usize, offset: usize) -> Self {
        assert!(offset < PAGE_SIZE);
        // Round the total byte count up to whole pages.
        let total_len = len + offset;
        let items = (total_len / PAGE_SIZE) + if total_len % PAGE_SIZE > 0 { 1 } else { 0 };
        let buf = vec![Page([0u8; PAGE_SIZE]); items].into_boxed_slice();
        AlignedSlice { buf, len, offset }
    }
}
impl ops::Deref for AlignedSlice {
    type Target = [u8];
    // View `len` bytes starting `offset` past the page-aligned allocation.
    fn deref(&self) -> &Self::Target {
        unsafe { slice::from_raw_parts(self.buf.as_ptr().cast::<u8>().add(self.offset), self.len) }
    }
}

impl ops::DerefMut for AlignedSlice {
    fn deref_mut(&mut self) -> &mut Self::Target {
        unsafe {
            slice::from_raw_parts_mut(
                self.buf.as_mut_ptr().cast::<u8>().add(self.offset),
                self.len,
            )
        }
    }
}
mod mcpy {
    use super::*;

    /// Copy length plus source/destination offsets from page alignment.
    struct Cfg {
        len: usize,
        s_off: usize,
        d_off: usize,
    }

    /// Build a 1-filled source and a zeroed destination of `len` bytes each.
    fn setup(cfg: Cfg) -> (usize, AlignedSlice, AlignedSlice) {
        let Cfg { len, s_off, d_off } = cfg;
        println!("bytes: {len} bytes, src offset: {s_off}, dst offset: {d_off}");
        let mut src = AlignedSlice::new_zeroed(len, s_off);
        let dst = AlignedSlice::new_zeroed(len, d_off);
        src.fill(1);
        (len, src, dst)
    }

    #[library_benchmark]
    #[benches::aligned(
        // Both aligned
        args = [
            Cfg { len: 16, s_off: 0, d_off: 0 },
            Cfg { len: 32, s_off: 0, d_off: 0 },
            Cfg { len: 64, s_off: 0, d_off: 0 },
            Cfg { len: 512, s_off: 0, d_off: 0 },
            Cfg { len: 4096, s_off: 0, d_off: 0 },
            Cfg { len: MEG1, s_off: 0, d_off: 0 },
        ],
        setup = setup,
    )]
    #[benches::offset(
        // Both at the same offset
        args = [
            Cfg { len: 16, s_off: 65, d_off: 65 },
            Cfg { len: 32, s_off: 65, d_off: 65 },
            Cfg { len: 64, s_off: 65, d_off: 65 },
            Cfg { len: 512, s_off: 65, d_off: 65 },
            Cfg { len: 4096, s_off: 65, d_off: 65 },
            Cfg { len: MEG1, s_off: 65, d_off: 65 },
        ],
        setup = setup,
    )]
    #[benches::misaligned(
        // `src` and `dst` both misaligned by different amounts
        args = [
            Cfg { len: 16, s_off: 65, d_off: 66 },
            Cfg { len: 32, s_off: 65, d_off: 66 },
            Cfg { len: 64, s_off: 65, d_off: 66 },
            Cfg { len: 512, s_off: 65, d_off: 66 },
            Cfg { len: 4096, s_off: 65, d_off: 66 },
            Cfg { len: MEG1, s_off: 65, d_off: 66 },
        ],
        setup = setup,
    )]
    // NOTE(review): `setup` returns `(len, src, dst)`. The previous pattern,
    // `(len, mut dst, src)`, silently swapped the buffers, so the filled
    // "source" was written over and the printed src/dst offsets described the
    // opposite buffers.
    fn bench((len, src, mut dst): (usize, AlignedSlice, AlignedSlice)) {
        unsafe {
            black_box(memcpy(
                black_box(dst.as_mut_ptr()),
                black_box(src.as_ptr()),
                black_box(len),
            ));
        }
    }

    library_benchmark_group!(name = memcpy; benchmarks = bench);
}
mod mset {
    use super::*;

    // Fill length plus offset from page alignment.
    struct Cfg {
        len: usize,
        offset: usize,
    }

    /// Build a zeroed destination buffer for a `memset` of `len` bytes.
    fn setup(Cfg { len, offset }: Cfg) -> (usize, AlignedSlice) {
        println!("bytes: {len}, offset: {offset}");
        (len, AlignedSlice::new_zeroed(len, offset))
    }

    #[library_benchmark]
    #[benches::aligned(
        args = [
            Cfg { len: 16, offset: 0 },
            Cfg { len: 32, offset: 0 },
            Cfg { len: 64, offset: 0 },
            Cfg { len: 512, offset: 0 },
            Cfg { len: 4096, offset: 0 },
            Cfg { len: MEG1, offset: 0 },
        ],
        setup = setup,
    )]
    #[benches::offset(
        args = [
            Cfg { len: 16, offset: 65 },
            Cfg { len: 32, offset: 65 },
            Cfg { len: 64, offset: 65 },
            Cfg { len: 512, offset: 65 },
            Cfg { len: 4096, offset: 65 },
            Cfg { len: MEG1, offset: 65 },
        ],
        setup = setup,
    )]
    fn bench((len, mut dst): (usize, AlignedSlice)) {
        unsafe {
            black_box(memset(
                black_box(dst.as_mut_ptr()),
                black_box(27),
                black_box(len),
            ));
        }
    }

    library_benchmark_group!(name = memset; benchmarks = bench);
}
mod mcmp {
    use super::*;

    // Compare length plus per-buffer offsets from page alignment.
    struct Cfg {
        len: usize,
        s_off: usize,
        d_off: usize,
    }

    /// Build two zeroed buffers that differ only in the final byte of the
    /// second, so the comparison scans the full length.
    fn setup(cfg: Cfg) -> (usize, AlignedSlice, AlignedSlice) {
        let Cfg { len, s_off, d_off } = cfg;
        println!("bytes: {len}, src offset: {s_off}, dst offset: {d_off}");
        let b1 = AlignedSlice::new_zeroed(len, s_off);
        let mut b2 = AlignedSlice::new_zeroed(len, d_off);
        b2[len - 1] = 1;
        (len, b1, b2)
    }

    #[library_benchmark]
    #[benches::aligned(
        // Both aligned
        args = [
            Cfg { len: 16, s_off: 0, d_off: 0 },
            Cfg { len: 32, s_off: 0, d_off: 0 },
            Cfg { len: 64, s_off: 0, d_off: 0 },
            Cfg { len: 512, s_off: 0, d_off: 0 },
            Cfg { len: 4096, s_off: 0, d_off: 0 },
            Cfg { len: MEG1, s_off: 0, d_off: 0 },
        ],
        setup = setup
    )]
    #[benches::offset(
        // Both at the same offset
        args = [
            Cfg { len: 16, s_off: 65, d_off: 65 },
            Cfg { len: 32, s_off: 65, d_off: 65 },
            Cfg { len: 64, s_off: 65, d_off: 65 },
            Cfg { len: 512, s_off: 65, d_off: 65 },
            Cfg { len: 4096, s_off: 65, d_off: 65 },
            Cfg { len: MEG1, s_off: 65, d_off: 65 },
        ],
        setup = setup
    )]
    #[benches::misaligned(
        // `src` and `dst` both misaligned by different amounts
        args = [
            Cfg { len: 16, s_off: 65, d_off: 66 },
            Cfg { len: 32, s_off: 65, d_off: 66 },
            Cfg { len: 64, s_off: 65, d_off: 66 },
            Cfg { len: 512, s_off: 65, d_off: 66 },
            Cfg { len: 4096, s_off: 65, d_off: 66 },
            Cfg { len: MEG1, s_off: 65, d_off: 66 },
        ],
        setup = setup
    )]
    // NOTE(review): the `dst`/`src` names are borrowed from the memcpy bench;
    // `memcmp` only reads both buffers, so the swap relative to `setup`'s
    // `(len, b1, b2)` tuple does not affect what is measured.
    fn bench((len, mut dst, src): (usize, AlignedSlice, AlignedSlice)) {
        unsafe {
            black_box(memcmp(
                black_box(dst.as_mut_ptr()),
                black_box(src.as_ptr()),
                black_box(len),
            ));
        }
    }

    library_benchmark_group!(name = memcmp; benchmarks = bench);
}
mod mmove {
    use Spread::{Aligned, Large, Medium, Small};

    use super::*;

    // Move length, distance between `src` and `dst`, and offset from page
    // alignment.
    struct Cfg {
        len: usize,
        spread: Spread,
        off: usize,
    }

    enum Spread {
        /// `src` and `dst` are close and have the same alignment (or offset).
        Aligned,
        /// `src` and `dst` are close.
        Small,
        /// `src` and `dst` are halfway offset in the buffer.
        Medium,
        /// `src` and `dst` only overlap by a single byte.
        Large,
    }

    /// Turn a `Spread` variant into a concrete byte distance for a move of
    /// `len` bytes. `Small` and `Large` use fixed relationships (`1` and
    /// `len - 1`) and are therefore misaligned regardless of `len`.
    fn calculate_spread(len: usize, spread: Spread) -> usize {
        match spread {
            // Note that this test doesn't make sense for lengths less than len=128
            Aligned => {
                assert!(len > MAX_ALIGN, "aligned memset would have no overlap");
                MAX_ALIGN
            }
            Small => 1,
            Medium => (len / 2) + 1, // add 1 so all are misaligned
            Large => len - 1,
        }
    }

    /// Build a buffer whose *first* `len` bytes hold a counting pattern, for
    /// a move toward the end of the buffer.
    fn setup_forward(cfg: Cfg) -> (usize, usize, AlignedSlice) {
        let Cfg { len, spread, off } = cfg;
        let spread = calculate_spread(len, spread);
        println!("bytes: {len}, spread: {spread}, offset: {off}, forward");
        assert!(spread < len, "memmove tests should have some overlap");
        let mut buf = AlignedSlice::new_zeroed(len + spread, off);
        let mut fill: usize = 0;
        buf[..len].fill_with(|| {
            fill += 1;
            fill as u8
        });
        (len, spread, buf)
    }

    /// Build a buffer whose *last* `len` bytes hold a counting pattern, for a
    /// move toward the start of the buffer.
    fn setup_backward(cfg: Cfg) -> (usize, usize, AlignedSlice) {
        let Cfg { len, spread, off } = cfg;
        let spread = calculate_spread(len, spread);
        println!("bytes: {len}, spread: {spread}, offset: {off}, backward");
        assert!(spread < len, "memmove tests should have some overlap");
        let mut buf = AlignedSlice::new_zeroed(len + spread, off);
        let mut fill: usize = 0;
        buf[spread..].fill_with(|| {
            fill += 1;
            fill as u8
        });
        (len, spread, buf)
    }

    #[library_benchmark]
    #[benches::aligned(
        args = [
            // Don't test small spreads since there is no overlap
            Cfg { len: 4096, spread: Aligned, off: 0 },
            Cfg { len: MEG1, spread: Aligned, off: 0 },
        ],
        setup = setup_forward
    )]
    #[benches::small_spread(
        args = [
            Cfg { len: 16, spread: Small, off: 0 },
            Cfg { len: 32, spread: Small, off: 0 },
            Cfg { len: 64, spread: Small, off: 0 },
            Cfg { len: 512, spread: Small, off: 0 },
            Cfg { len: 4096, spread: Small, off: 0 },
            Cfg { len: MEG1, spread: Small, off: 0 },
        ],
        setup = setup_forward
    )]
    #[benches::medium_spread(
        args = [
            Cfg { len: 16, spread: Medium, off: 0 },
            Cfg { len: 32, spread: Medium, off: 0 },
            Cfg { len: 64, spread: Medium, off: 0 },
            Cfg { len: 512, spread: Medium, off: 0 },
            Cfg { len: 4096, spread: Medium, off: 0 },
            Cfg { len: MEG1, spread: Medium, off: 0 },
        ],
        setup = setup_forward
    )]
    #[benches::large_spread(
        args = [
            Cfg { len: 16, spread: Large, off: 0 },
            Cfg { len: 32, spread: Large, off: 0 },
            Cfg { len: 64, spread: Large, off: 0 },
            Cfg { len: 512, spread: Large, off: 0 },
            Cfg { len: 4096, spread: Large, off: 0 },
            Cfg { len: MEG1, spread: Large, off: 0 },
        ],
        setup = setup_forward
    )]
    #[benches::aligned_off(
        args = [
            Cfg { len: 4096, spread: Aligned, off: 65 },
            Cfg { len: MEG1, spread: Aligned, off: 65 },
        ],
        setup = setup_forward
    )]
    #[benches::small_spread_off(
        args = [
            Cfg { len: 16, spread: Small, off: 65 },
            Cfg { len: 32, spread: Small, off: 65 },
            Cfg { len: 64, spread: Small, off: 65 },
            Cfg { len: 512, spread: Small, off: 65 },
            Cfg { len: 4096, spread: Small, off: 65 },
            Cfg { len: MEG1, spread: Small, off: 65 },
        ],
        setup = setup_forward
    )]
    #[benches::medium_spread_off(
        args = [
            Cfg { len: 16, spread: Medium, off: 65 },
            Cfg { len: 32, spread: Medium, off: 65 },
            Cfg { len: 64, spread: Medium, off: 65 },
            Cfg { len: 512, spread: Medium, off: 65 },
            Cfg { len: 4096, spread: Medium, off: 65 },
            Cfg { len: MEG1, spread: Medium, off: 65 },
        ],
        setup = setup_forward
    )]
    #[benches::large_spread_off(
        args = [
            Cfg { len: 16, spread: Large, off: 65 },
            Cfg { len: 32, spread: Large, off: 65 },
            Cfg { len: 64, spread: Large, off: 65 },
            Cfg { len: 512, spread: Large, off: 65 },
            Cfg { len: 4096, spread: Large, off: 65 },
            Cfg { len: MEG1, spread: Large, off: 65 },
        ],
        setup = setup_forward
    )]
    fn forward((len, spread, mut buf): (usize, usize, AlignedSlice)) {
        // Test moving from the start of the buffer toward the end
        unsafe {
            black_box(memmove(
                black_box(buf[spread..].as_mut_ptr()),
                black_box(buf.as_ptr()),
                black_box(len),
            ));
        }
    }

    #[library_benchmark]
    #[benches::aligned(
        args = [
            // Don't test small spreads since there is no overlap
            Cfg { len: 4096, spread: Aligned, off: 0 },
            Cfg { len: MEG1, spread: Aligned, off: 0 },
        ],
        setup = setup_backward
    )]
    #[benches::small_spread(
        args = [
            Cfg { len: 16, spread: Small, off: 0 },
            Cfg { len: 32, spread: Small, off: 0 },
            Cfg { len: 64, spread: Small, off: 0 },
            Cfg { len: 512, spread: Small, off: 0 },
            Cfg { len: 4096, spread: Small, off: 0 },
            Cfg { len: MEG1, spread: Small, off: 0 },
        ],
        setup = setup_backward
    )]
    #[benches::medium_spread(
        args = [
            Cfg { len: 16, spread: Medium, off: 0 },
            Cfg { len: 32, spread: Medium, off: 0 },
            Cfg { len: 64, spread: Medium, off: 0 },
            Cfg { len: 512, spread: Medium, off: 0 },
            Cfg { len: 4096, spread: Medium, off: 0 },
            Cfg { len: MEG1, spread: Medium, off: 0 },
        ],
        setup = setup_backward
    )]
    #[benches::large_spread(
        args = [
            Cfg { len: 16, spread: Large, off: 0 },
            Cfg { len: 32, spread: Large, off: 0 },
            Cfg { len: 64, spread: Large, off: 0 },
            Cfg { len: 512, spread: Large, off: 0 },
            Cfg { len: 4096, spread: Large, off: 0 },
            Cfg { len: MEG1, spread: Large, off: 0 },
        ],
        setup = setup_backward
    )]
    #[benches::aligned_off(
        args = [
            // Don't test small spreads since there is no overlap
            Cfg { len: 4096, spread: Aligned, off: 65 },
            Cfg { len: MEG1, spread: Aligned, off: 65 },
        ],
        setup = setup_backward
    )]
    #[benches::small_spread_off(
        args = [
            Cfg { len: 16, spread: Small, off: 65 },
            Cfg { len: 32, spread: Small, off: 65 },
            Cfg { len: 64, spread: Small, off: 65 },
            Cfg { len: 512, spread: Small, off: 65 },
            Cfg { len: 4096, spread: Small, off: 65 },
            Cfg { len: MEG1, spread: Small, off: 65 },
        ],
        setup = setup_backward
    )]
    #[benches::medium_spread_off(
        args = [
            Cfg { len: 16, spread: Medium, off: 65 },
            Cfg { len: 32, spread: Medium, off: 65 },
            Cfg { len: 64, spread: Medium, off: 65 },
            Cfg { len: 512, spread: Medium, off: 65 },
            Cfg { len: 4096, spread: Medium, off: 65 },
            Cfg { len: MEG1, spread: Medium, off: 65 },
        ],
        setup = setup_backward
    )]
    #[benches::large_spread_off(
        args = [
            Cfg { len: 16, spread: Large, off: 65 },
            Cfg { len: 32, spread: Large, off: 65 },
            Cfg { len: 64, spread: Large, off: 65 },
            Cfg { len: 512, spread: Large, off: 65 },
            Cfg { len: 4096, spread: Large, off: 65 },
            Cfg { len: MEG1, spread: Large, off: 65 },
        ],
        setup = setup_backward
    )]
    fn backward((len, spread, mut buf): (usize, usize, AlignedSlice)) {
        // Test moving from the end of the buffer toward the start
        unsafe {
            black_box(memmove(
                black_box(buf.as_mut_ptr()),
                black_box(buf[spread..].as_ptr()),
                black_box(len),
            ));
        }
    }

    library_benchmark_group!(name = memmove; benchmarks = forward, backward);
}
use mcmp::memcmp;
use mcpy::memcpy;
use mmove::memmove;
use mset::memset;
// iai-callgrind entry point; the names listed here are the
// `library_benchmark_group!` groups declared in the modules above.
main!(library_benchmark_groups = memcpy, memset, memcmp, memmove);

View file

@ -0,0 +1,120 @@
use std::collections::HashSet;
// Textually include the target-detection helpers from the `compiler-builtins`
// crate's `configure.rs` so this build script can reuse them.
mod builtins_configure {
    include!("../compiler-builtins/configure.rs");
}
/// Features to enable when the target's system libraries are missing the
/// corresponding symbols (tests then fall back to apfloat-based results).
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)]
enum Feature {
    // `f128` symbols are unavailable.
    NoSysF128,
    // `f128 <-> int` conversion symbols are unavailable.
    NoSysF128IntConvert,
    // `f16` symbols are unavailable.
    NoSysF16,
    // `f16 <-> f64` conversion symbols are unavailable.
    NoSysF16F64Convert,
    // `f16 <-> f128` conversion symbols are unavailable.
    NoSysF16F128Convert,
}
impl Feature {
    /// Features that are automatically implied when `self` is enabled.
    ///
    /// Missing `f128` (or `f16`) symbols necessarily mean the related
    /// conversion symbols are missing as well.
    fn implies(self) -> &'static [Self] {
        match self {
            Self::NoSysF128 => &[Self::NoSysF128IntConvert, Self::NoSysF16F128Convert],
            Self::NoSysF16 => &[Self::NoSysF16F64Convert, Self::NoSysF16F128Convert],
            Self::NoSysF128IntConvert | Self::NoSysF16F64Convert | Self::NoSysF16F128Convert => {
                &[]
            }
        }
    }
}
fn main() {
    println!("cargo::rerun-if-changed=../configure.rs");
    let target = builtins_configure::Target::from_env();
    let mut features = HashSet::new();
    // These platforms do not have f128 symbols available in their system libraries, so
    // skip related tests.
    if target.arch == "arm"
        || target.vendor == "apple"
        || target.env == "msvc"
        // GCC and LLVM disagree on the ABI of `f16` and `f128` with MinGW. See
        // <https://gcc.gnu.org/bugzilla/show_bug.cgi?id=115054>.
        || (target.os == "windows" && target.env == "gnu")
        // FIXME(llvm): There is an ABI incompatibility between GCC and Clang on 32-bit x86.
        // See <https://github.com/llvm/llvm-project/issues/77401>.
        || target.arch == "x86"
        // 32-bit PowerPC and 64-bit LE gets code generated that Qemu cannot handle. See
        // <https://github.com/rust-lang/compiler-builtins/pull/606#issuecomment-2105635926>.
        || target.arch == "powerpc"
        // NOTE(review): this assumes `Target::arch` carries the raw triple component
        // (where little-endian is spelled `powerpc64le`), not cfg-style `target_arch`
        // (which is `powerpc64` for both endians) — confirm against configure.rs.
        || target.arch == "powerpc64le"
        // FIXME: We get different results from the builtin functions. See
        // <https://github.com/rust-lang/compiler-builtins/pull/606#issuecomment-2105657287>.
        || target.arch == "powerpc64"
    {
        features.insert(Feature::NoSysF128);
    }
    if target.arch == "x86" {
        // 32-bit x86 does not have `__fixunstfti`/`__fixtfti` but does have everything else
        features.insert(Feature::NoSysF128IntConvert);
        // FIXME: 32-bit x86 has a bug in `f128 -> f16` system libraries
        features.insert(Feature::NoSysF16F128Convert);
    }
    // These platforms do not have f16 symbols available in their system libraries, so
    // skip related tests. Most of these are missing `f16 <-> f32` conversion routines.
    if (target.arch == "aarch64" && target.os == "linux")
        || target.arch.starts_with("arm")
        || target.arch == "powerpc"
        || target.arch == "powerpc64"
        || target.arch == "powerpc64le"
        || target.arch == "loongarch64"
        || (target.arch == "x86" && !target.has_feature("sse"))
        || target.os == "windows"
        // Linking says "error: function signature mismatch: __extendhfsf2" and seems to
        // think the signature is either `(i32) -> f32` or `(f32) -> f32`. See
        // <https://github.com/llvm/llvm-project/issues/96438>.
        || target.arch == "wasm32"
        || target.arch == "wasm64"
    {
        features.insert(Feature::NoSysF16);
    }
    // These platforms are missing either `__extendhfdf2` or `__truncdfhf2`.
    if target.vendor == "apple" || target.os == "windows" {
        features.insert(Feature::NoSysF16F64Convert);
    }
    // Add implied features. Collection is required for borrows.
    features.extend(
        features
            .iter()
            .flat_map(|x| x.implies())
            .copied()
            .collect::<Vec<_>>(),
    );
    // Emit a cargo cfg (and a visible build warning) for every fallback selected above.
    for feature in features {
        let (name, warning) = match feature {
            Feature::NoSysF128 => ("no-sys-f128", "using apfloat fallback for f128"),
            Feature::NoSysF128IntConvert => (
                "no-sys-f128-int-convert",
                "using apfloat fallback for f128 <-> int conversions",
            ),
            Feature::NoSysF16F64Convert => (
                "no-sys-f16-f64-convert",
                "using apfloat fallback for f16 <-> f64 conversions",
            ),
            Feature::NoSysF16F128Convert => (
                "no-sys-f16-f128-convert",
                "using apfloat fallback for f16 <-> f128 conversions",
            ),
            Feature::NoSysF16 => ("no-sys-f16", "using apfloat fallback for f16"),
        };
        println!("cargo:warning={warning}");
        println!("cargo:rustc-cfg=feature=\"{name}\"");
    }
    builtins_configure::configure_aliases(&target);
    builtins_configure::configure_f16_f128(&target);
}

View file

@ -0,0 +1,366 @@
use alloc::vec::Vec;
use core::cell::RefCell;
use compiler_builtins::float::Float;
/// Number of fuzzed inputs used when checking that two implementations agree
pub const CHECK_ITER_ITEMS: u32 = 10_000;
/// Number of inputs fed to each benchmark iteration, enough to get a variety
pub const BENCH_ITER_ITEMS: u32 = 500;
/// Still run benchmarks/tests but don't check correctness between compiler-builtins and
/// builtin system functions
pub fn skip_sys_checks(test_name: &str) -> bool {
    // FIXME(f16_f128): system symbols have incorrect results
    // <https://github.com/rust-lang/compiler-builtins/issues/617>
    // FIXME(#616): `mul_*` disabled until the fix is in nightly
    // <https://github.com/rust-lang/compiler-builtins/issues/616>
    const ALWAYS_SKIPPED: &[&str] = &[
        "extend_f16_f32",
        "trunc_f32_f16",
        "trunc_f64_f16",
        "mul_f32",
        "mul_f64",
    ];
    // FIXME(f16_f128): error on LE ppc64. There are more tests that are cfg-ed out completely
    // in their benchmark modules due to runtime panics.
    // <https://github.com/rust-lang/compiler-builtins/issues/617#issuecomment-2125914639>
    const PPC64LE_SKIPPED: &[&str] = &["extend_f32_f128"];
    // FIXME(f16_f128): system symbols have incorrect results
    // <https://github.com/rust-lang/compiler-builtins/issues/617#issuecomment-2125914639>
    const X86_NO_SSE_SKIPPED: &[&str] = &[
        "add_f128", "sub_f128", "mul_f128", "div_f128", "powi_f32", "powi_f64",
    ];
    // FIXME(f16_f128): Wide multiply carry bug in `compiler-rt`, re-enable when nightly no longer
    // uses `compiler-rt` version.
    // <https://github.com/llvm/llvm-project/issues/91840>
    const AARCH64_SKIPPED: &[&str] = &["mul_f128", "div_f128"];
    // FIXME(llvm): system symbols have incorrect results on Windows
    // <https://github.com/rust-lang/compiler-builtins/issues/617#issuecomment-2121359807>
    const WINDOWS_SKIPPED: &[&str] = &[
        "conv_f32_u128",
        "conv_f32_i128",
        "conv_f64_u128",
        "conv_f64_i128",
    ];
    // The Arm symbols need a different ABI that our macro doesn't handle, just skip it
    if cfg!(target_arch = "arm") {
        return true;
    }
    // Everything else reduces to "is this test on the skip list for the current target?".
    ALWAYS_SKIPPED.contains(&test_name)
        || (cfg!(all(target_arch = "powerpc64", target_endian = "little"))
            && PPC64LE_SKIPPED.contains(&test_name))
        || (cfg!(all(target_arch = "x86", not(target_feature = "sse")))
            && X86_NO_SSE_SKIPPED.contains(&test_name))
        || (cfg!(target_arch = "aarch64") && AARCH64_SKIPPED.contains(&test_name))
        || (cfg!(target_family = "windows") && WINDOWS_SKIPPED.contains(&test_name))
}
/// Still run benchmarks/tests but don't check correctness between compiler-builtins and
/// assembly functions
pub fn skip_asm_checks(_test_name: &str) -> bool {
    // No assembly comparisons need to be skipped at the moment.
    false
}
/// Create a comparison of the system symbol, compiler_builtins, and optionally handwritten
/// assembly.
///
/// Expands to a Criterion benchmark function named `$name` that first cross-checks the
/// implementations against each other on a fuzzed test vector, then times each of them.
///
/// # Safety
///
/// The signature must be correct and any assembly must be sound.
#[macro_export]
macro_rules! float_bench {
    (
        // Name of this benchmark
        name: $name:ident,
        // The function signature to be tested
        sig: ($($arg:ident: $arg_ty:ty),*) -> $ret_ty:ty,
        // Path to the crate in compiler_builtins
        crate_fn: $crate_fn:path,
        // Optional alias on ppc
        $( crate_fn_ppc: $crate_fn_ppc:path, )?
        // Name of the system symbol
        sys_fn: $sys_fn:ident,
        // Optional alias on ppc
        $( sys_fn_ppc: $sys_fn_ppc:path, )?
        // Meta saying whether the system symbol is available
        sys_available: $sys_available:meta,
        // An optional function to validate the results of two functions are equal, if not
        // just `$ret_ty::check_eq`
        $( output_eq: $output_eq:expr, )?
        // Assembly implementations, if any.
        asm: [
            $(
                #[cfg($asm_meta:meta)] {
                    $($asm_tt:tt)*
                }
            );*
            $(;)?
        ]
        $(,)?
    ) => {paste::paste! {
        // SAFETY: macro invocation must use the correct signature
        #[cfg($sys_available)]
        unsafe extern "C" {
            /// Binding for the system function
            #[cfg(not(any(target_arch = "powerpc", target_arch = "powerpc64")))]
            fn $sys_fn($($arg: $arg_ty),*) -> $ret_ty;
            #[cfg(any(target_arch = "powerpc", target_arch = "powerpc64"))]
            float_bench! { @coalesce_fn $($sys_fn_ppc)? =>
                fn $sys_fn($($arg: $arg_ty),*) -> $ret_ty;
            }
        }
        // The generated Criterion harness.
        fn $name(c: &mut Criterion) {
            use core::hint::black_box;
            use compiler_builtins::float::Float;
            use $crate::bench::TestIO;
            // Thin wrappers give all three candidates comparable call overhead.
            #[inline(never)] // equalize with external calls
            fn crate_fn($($arg: $arg_ty),*) -> $ret_ty {
                #[cfg(not(any(target_arch = "powerpc", target_arch = "powerpc64")))]
                let target_crate_fn = $crate_fn;
                // On PPC, use an alias if specified
                #[cfg(any(target_arch = "powerpc", target_arch = "powerpc64"))]
                let target_crate_fn = float_bench!(@coalesce $($crate_fn_ppc)?, $crate_fn);
                target_crate_fn( $($arg),* )
            }
            #[inline(always)] // already a branch
            #[cfg($sys_available)]
            fn sys_fn($($arg: $arg_ty),*) -> $ret_ty {
                #[cfg(not(any(target_arch = "powerpc", target_arch = "powerpc64")))]
                let target_sys_fn = $sys_fn;
                // On PPC, use an alias if specified
                #[cfg(any(target_arch = "powerpc", target_arch = "powerpc64"))]
                let target_sys_fn = float_bench!(@coalesce $($sys_fn_ppc)?, $sys_fn);
                unsafe { target_sys_fn( $($arg),* ) }
            }
            #[inline(never)] // equalize with external calls
            #[cfg(any( $($asm_meta),* ))]
            fn asm_fn($(mut $arg: $arg_ty),*) -> $ret_ty {
                use core::arch::asm;
                $(
                    #[cfg($asm_meta)]
                    unsafe { $($asm_tt)* }
                )*
            }
            // Deterministic input vectors built via the `TestIO` impls.
            let testvec = <($($arg_ty),*)>::make_testvec($crate::bench::CHECK_ITER_ITEMS);
            let benchvec = <($($arg_ty),*)>::make_testvec($crate::bench::BENCH_ITER_ITEMS);
            let test_name = stringify!($name);
            let check_eq = float_bench!(@coalesce $($output_eq)?, $ret_ty::check_eq);
            // Verify math lines up. We run the crate functions even if we don't validate the
            // output here to make sure there are no panics or crashes.
            #[cfg($sys_available)]
            for ($($arg),*) in testvec.iter().copied() {
                let crate_res = crate_fn($($arg),*);
                let sys_res = sys_fn($($arg),*);
                if $crate::bench::skip_sys_checks(test_name) {
                    continue;
                }
                assert!(
                    check_eq(crate_res, sys_res),
                    "{test_name}{:?}: crate: {crate_res:?}, sys: {sys_res:?}",
                    ($($arg),* ,)
                );
            }
            #[cfg(any( $($asm_meta),* ))]
            {
                for ($($arg),*) in testvec.iter().copied() {
                    let crate_res = crate_fn($($arg),*);
                    let asm_res = asm_fn($($arg),*);
                    if $crate::bench::skip_asm_checks(test_name) {
                        continue;
                    }
                    assert!(
                        check_eq(crate_res, asm_res),
                        "{test_name}{:?}: crate: {crate_res:?}, asm: {asm_res:?}",
                        ($($arg),* ,)
                    );
                }
            }
            // Timing runs: each candidate processes the same benchmark vector.
            let mut group = c.benchmark_group(test_name);
            group.bench_function("compiler-builtins", |b| b.iter(|| {
                for ($($arg),*) in benchvec.iter().copied() {
                    black_box(crate_fn( $(black_box($arg)),* ));
                }
            }));
            #[cfg($sys_available)]
            group.bench_function("system", |b| b.iter(|| {
                for ($($arg),*) in benchvec.iter().copied() {
                    black_box(sys_fn( $(black_box($arg)),* ));
                }
            }));
            #[cfg(any( $($asm_meta),* ))]
            group.bench_function(&format!(
                "assembly ({} {})", std::env::consts::ARCH, std::env::consts::FAMILY
            ), |b| b.iter(|| {
                for ($($arg),*) in benchvec.iter().copied() {
                    black_box(asm_fn( $(black_box($arg)),* ));
                }
            }));
            group.finish();
        }
    }};
    // Allow overriding a default
    (@coalesce $specified:expr, $default:expr) => { $specified };
    (@coalesce, $default:expr) => { $default };
    // Allow overriding a function name
    (@coalesce_fn $specified:ident => fn $default_name:ident $($tt:tt)+) => {
        fn $specified $($tt)+
    };
    (@coalesce_fn => fn $default_name:ident $($tt:tt)+) => {
        fn $default_name $($tt)+
    };
}
/// A type used as either an input or output to/from a benchmark function.
pub trait TestIO: Sized {
    /// Build a deterministic vector of fuzzed values of this type, sized by `len`.
    fn make_testvec(len: u32) -> Vec<Self>;
    /// Whether two results should be considered equal for verification purposes.
    fn check_eq(a: Self, b: Self) -> bool;
}
// Implements `TestIO` for floats, integers, same-type pairs, and (float, int)
// pairs. For the pair impls `check_eq` is `unimplemented!()` — presumably
// because tuples are only ever used as benchmark *inputs*, never as outputs.
macro_rules! impl_testio {
    (float $($f_ty:ty),+) => {$(
        impl TestIO for $f_ty {
            fn make_testvec(len: u32) -> Vec<Self> {
                // refcell because fuzz_* takes a `Fn`
                let ret = RefCell::new(Vec::new());
                crate::fuzz_float(len, |a| ret.borrow_mut().push(a));
                ret.into_inner()
            }
            fn check_eq(a: Self, b: Self) -> bool {
                // bitwise comparison so NaNs and signed zeros are distinguished
                Float::eq_repr(a, b)
            }
        }
        impl TestIO for ($f_ty, $f_ty) {
            fn make_testvec(len: u32) -> Vec<Self> {
                // refcell because fuzz_* takes a `Fn`
                let ret = RefCell::new(Vec::new());
                crate::fuzz_float_2(len, |a, b| ret.borrow_mut().push((a, b)));
                ret.into_inner()
            }
            fn check_eq(_a: Self, _b: Self) -> bool {
                unimplemented!()
            }
        }
    )*};
    (int $($i_ty:ty),+) => {$(
        impl TestIO for $i_ty {
            fn make_testvec(len: u32) -> Vec<Self> {
                // refcell because fuzz_* takes a `Fn`
                let ret = RefCell::new(Vec::new());
                crate::fuzz(len, |a| ret.borrow_mut().push(a));
                ret.into_inner()
            }
            fn check_eq(a: Self, b: Self) -> bool {
                a == b
            }
        }
        impl TestIO for ($i_ty, $i_ty) {
            fn make_testvec(len: u32) -> Vec<Self> {
                // refcell because fuzz_* takes a `Fn`
                let ret = RefCell::new(Vec::new());
                crate::fuzz_2(len, |a, b| ret.borrow_mut().push((a, b)));
                ret.into_inner()
            }
            fn check_eq(_a: Self, _b: Self) -> bool {
                unimplemented!()
            }
        }
    )*};
    ((float, int) ($f_ty:ty, $i_ty:ty)) => {
        impl TestIO for ($f_ty, $i_ty) {
            fn make_testvec(len: u32) -> Vec<Self> {
                // refcell because fuzz_* takes a `Fn`
                let ivec = RefCell::new(Vec::new());
                let fvec = RefCell::new(Vec::new());
                // cross product of ~sqrt(len) values of each kind keeps the
                // total vector near `len` entries
                crate::fuzz(len.isqrt(), |a| ivec.borrow_mut().push(a));
                crate::fuzz_float(len.isqrt(), |a| fvec.borrow_mut().push(a));
                let mut ret = Vec::new();
                let ivec = ivec.into_inner();
                let fvec = fvec.into_inner();
                for f in fvec {
                    for i in &ivec {
                        ret.push((f, *i));
                    }
                }
                ret
            }
            fn check_eq(_a: Self, _b: Self) -> bool {
                unimplemented!()
            }
        }
    }
}
// Instantiate `TestIO` for every input/output type the benchmarks use;
// `f16`/`f128` impls are gated on the build-script-provided cfgs.
#[cfg(f16_enabled)]
impl_testio!(float f16);
impl_testio!(float f32, f64);
#[cfg(f128_enabled)]
impl_testio!(float f128);
impl_testio!(int i16, i32, i64, i128);
impl_testio!(int u16, u32, u64, u128);
impl_testio!((float, int)(f32, i32));
impl_testio!((float, int)(f64, i32));
#[cfg(f128_enabled)]
impl_testio!((float, int)(f128, i32));

View file

@ -0,0 +1,337 @@
//! This crate is for integration testing and fuzz testing of functions in `compiler-builtins`. This
//! includes publicly documented intrinsics and some internal alternative implementation functions
//! such as `usize_leading_zeros_riscv` (which are tested because they are configured for
//! architectures not tested by the CI).
//!
//! The general idea is to use a combination of edge case testing and randomized fuzz testing. The
//! edge case testing is crucial for checking cases like where both inputs are equal or equal to
//! special values such as `i128::MIN`, which is unlikely for the random fuzzer by itself to
//! encounter. The randomized fuzz testing is specially designed to cover wide swaths of search
//! space in as few iterations as possible. See `fuzz_values` in `builtins-test/tests/misc.rs` for
//! an example.
//!
//! Some floating point tests are disabled for specific architectures, because they do not have
//! correct rounding.
#![no_std]
#![cfg_attr(f128_enabled, feature(f128))]
#![cfg_attr(f16_enabled, feature(f16))]
pub mod bench;
extern crate alloc;
use compiler_builtins::float::Float;
use compiler_builtins::int::{Int, MinInt};
use rand_xoshiro::Xoshiro128StarStar;
use rand_xoshiro::rand_core::{RngCore, SeedableRng};
/// Sets the number of fuzz iterations run for most tests. In practice, the vast majority of bugs
/// are caught by the edge case testers. Most of the remaining bugs triggered by more complex
/// sequences are caught well within 10_000 fuzz iterations. For classes of algorithms like division
/// that are vulnerable to rare edge cases, we want 1_000_000 iterations to be more confident. In
/// practical CI, however, we only want to run the more strenuous test once to catch algorithmic
/// level bugs, and run the 10_000 iteration test on most targets. Target-dependent bugs are likely
/// to involve miscompilation and misconfiguration that is likely to break algorithms in quickly
/// caught ways. We choose to configure `N = 1_000_000` iterations for `x86_64` targets (and if
/// debug assertions are disabled. Tests without `--release` would take too long) which are likely
/// to have fast hardware, and run `N = 10_000` for all other targets.
pub const N: u32 = if cfg!(target_arch = "x86_64") && !cfg!(debug_assertions) {
    1_000_000 // strenuous run for fast hardware in release mode
} else {
    10_000 // everything else, including debug builds on x86_64
};
/// Random fuzzing step. When run several times, it results in excellent fuzzing entropy such as:
/// 11110101010101011110111110011111
/// 10110101010100001011101011001010
/// 1000000000000000
/// 10000000000000110111110000001010
/// 1111011111111101010101111110101
/// 101111111110100000000101000000
/// 10000000110100000000100010101
/// 1010101010101000
fn fuzz_step<I: Int>(rng: &mut Xoshiro128StarStar, x: &mut I) {
    let ones = !I::ZERO;
    // `I::BITS` is a power of two, so this is a valid mask for shift amounts
    let bit_indexing_mask: u32 = I::BITS - 1;
    // It happens that all the RNG we need can come from one call. 7 bits are needed to index a
    // worst case 128 bit integer, and there are 4 indexes that need to be made plus 4 bits for
    // selecting operations
    let rng32 = rng.next_u32();
    // Randomly OR, AND, and XOR randomly sized and shifted continuous strings of
    // ones with `lhs` and `rhs`.
    let r0 = bit_indexing_mask & rng32;
    let r1 = bit_indexing_mask & (rng32 >> 7);
    let mask = ones.wrapping_shl(r0).rotate_left(r1);
    match (rng32 >> 14) % 4 {
        0 => *x |= mask,
        1 => *x &= mask,
        // both 2 and 3 to make XORs as common as ORs and ANDs combined
        _ => *x ^= mask,
    }
    // Alternating ones and zeros (e.x. 0b1010101010101010). This catches second-order
    // problems that might occur for algorithms with two modes of operation (potentially
    // there is some invariant that can be broken and maintained via alternating between modes,
    // breaking the algorithm when it reaches the end).
    let mut alt_ones = I::ONE;
    for _ in 0..(I::BITS / 2) {
        alt_ones <<= 2;
        alt_ones |= I::ONE;
    }
    let r0 = bit_indexing_mask & (rng32 >> 16);
    let r1 = bit_indexing_mask & (rng32 >> 23);
    let mask = alt_ones.wrapping_shl(r0).rotate_left(r1);
    // top two bits of `rng32`; as above, XOR is selected twice as often
    match rng32 >> 30 {
        0 => *x |= mask,
        1 => *x &= mask,
        _ => *x ^= mask,
    }
}
// We need macros like this, because `#![no_std]` prevents us from using iterators
//
// Sweeps all pairs of fuzz lengths, producing values that consist of a single
// contiguous run of ones (`mask_lo & mask_hi`) of varying width and position.
//
// Fixed: the body previously used a bare `I` in three places instead of the
// `$I` metavariable, silently requiring every caller to name its type
// parameter exactly `I`. All uses now go through `$I`; behavior is unchanged
// for existing call sites.
macro_rules! edge_cases {
    ($I:ident, $case:ident, $inner:block) => {
        for i0 in 0..$I::FUZZ_NUM {
            let mask_lo = (!$I::UnsignedInt::ZERO).wrapping_shr($I::FUZZ_LENGTHS[i0] as u32);
            for i1 in i0..$I::FUZZ_NUM {
                let mask_hi =
                    (!$I::UnsignedInt::ZERO).wrapping_shl($I::FUZZ_LENGTHS[i1 - i0] as u32);
                let $case = $I::from_unsigned(mask_lo & mask_hi);
                $inner
            }
        }
    };
}
/// Feeds a series of fuzzing inputs to `f`. The fuzzer first uses an algorithm designed to find
/// edge cases, followed by a more random fuzzer that runs `n` times.
pub fn fuzz<I: Int, F: FnMut(I)>(n: u32, mut f: F)
where
    <I as MinInt>::UnsignedInt: Int,
{
    // edge case tester. Calls `f` 210 times for u128.
    // zero gets skipped by the loop
    f(I::ZERO);
    edge_cases!(I, case, {
        f(case);
    });
    // random fuzzer; the fixed seed keeps runs reproducible
    let mut rng = Xoshiro128StarStar::seed_from_u64(0);
    let mut x: I = MinInt::ZERO;
    for _ in 0..n {
        // each step mutates `x` in place rather than generating a fresh value
        fuzz_step(&mut rng, &mut x);
        f(x)
    }
}
/// The same as `fuzz`, except `f` has two inputs.
pub fn fuzz_2<I: Int, F: Fn(I, I)>(n: u32, f: F)
where
    <I as MinInt>::UnsignedInt: Int,
{
    // Check cases where the first and second inputs are zero. Both call `f` 210 times for `u128`.
    edge_cases!(I, case, {
        f(I::ZERO, case);
    });
    edge_cases!(I, case, {
        f(case, I::ZERO);
    });
    // Nested edge tester. Calls `f` 44100 times for `u128`.
    edge_cases!(I, case0, {
        edge_cases!(I, case1, {
            f(case0, case1);
        })
    });
    // random fuzzer; the fixed seed keeps runs reproducible
    let mut rng = Xoshiro128StarStar::seed_from_u64(0);
    let mut x: I = I::ZERO;
    let mut y: I = I::ZERO;
    for _ in 0..n {
        fuzz_step(&mut rng, &mut x);
        fuzz_step(&mut rng, &mut y);
        f(x, y)
    }
}
/// Tester for shift functions
pub fn fuzz_shift<I: Int, F: Fn(I, u32)>(f: F) {
    // Shift functions are very simple and do not need anything other than shifting a small
    // set of random patterns for every fuzz length.
    let mut rng = Xoshiro128StarStar::seed_from_u64(0);
    let mut x: I = MinInt::ZERO;
    for i in 0..I::FUZZ_NUM {
        fuzz_step(&mut rng, &mut x);
        // exercise both shift-by-zero and each canonical fuzz length
        f(x, MinInt::ZERO);
        f(x, I::FUZZ_LENGTHS[i] as u32);
    }
}
/// One randomized mutation step for a float's sign, exponent, and significand.
fn fuzz_float_step<F: Float>(rng: &mut Xoshiro128StarStar, f: &mut F) {
    let rng32 = rng.next_u32();
    // we need to fuzz the different parts of the float separately, because the masking on larger
    // significands will tend to set the exponent to all ones or all zeros frequently
    // sign bit fuzzing
    let sign = (rng32 & 1) != 0;
    // exponent fuzzing. Only 4 bits for the selector needed.
    let ones = (F::Int::ONE << F::EXP_BITS) - F::Int::ONE;
    let r0 = (rng32 >> 1) % F::EXP_BITS;
    let r1 = (rng32 >> 5) % F::EXP_BITS;
    // custom rotate shift. Note that `F::Int` is unsigned, so we can shift right without smearing
    // the sign bit.
    let mask = if r1 == 0 {
        ones.wrapping_shr(r0)
    } else {
        let tmp = ones.wrapping_shr(r0);
        (tmp.wrapping_shl(r1) | tmp.wrapping_shr(F::EXP_BITS - r1)) & ones
    };
    let mut exp = (f.to_bits() & F::EXP_MASK) >> F::SIG_BITS;
    // 2 and 3 both select XOR, biasing toward it as in `fuzz_step`
    match (rng32 >> 9) % 4 {
        0 => exp |= mask,
        1 => exp &= mask,
        _ => exp ^= mask,
    }
    // significand fuzzing; reuse the integer fuzzer, then mask back into range
    let mut sig = f.to_bits() & F::SIG_MASK;
    fuzz_step(rng, &mut sig);
    sig &= F::SIG_MASK;
    *f = F::from_parts(sign, exp, sig);
}
// Cross product of boundary exponents × boundary significands × both signs.
//
// Fixed: the body previously used a bare `F` everywhere and never referenced
// the captured `$F` metavariable, silently requiring every caller to name its
// type parameter exactly `F`. All uses now go through `$F`; behavior is
// unchanged for existing call sites.
macro_rules! float_edge_cases {
    ($F:ident, $case:ident, $inner:block) => {
        for exponent in [
            $F::Int::ZERO,
            $F::Int::ONE,
            $F::Int::ONE << ($F::EXP_BITS / 2),
            ($F::Int::ONE << ($F::EXP_BITS - 1)) - $F::Int::ONE,
            $F::Int::ONE << ($F::EXP_BITS - 1),
            ($F::Int::ONE << ($F::EXP_BITS - 1)) + $F::Int::ONE,
            ($F::Int::ONE << $F::EXP_BITS) - $F::Int::ONE,
        ]
        .iter()
        {
            for significand in [
                $F::Int::ZERO,
                $F::Int::ONE,
                $F::Int::ONE << ($F::SIG_BITS / 2),
                ($F::Int::ONE << ($F::SIG_BITS - 1)) - $F::Int::ONE,
                $F::Int::ONE << ($F::SIG_BITS - 1),
                ($F::Int::ONE << ($F::SIG_BITS - 1)) + $F::Int::ONE,
                ($F::Int::ONE << $F::SIG_BITS) - $F::Int::ONE,
            ]
            .iter()
            {
                for sign in [false, true].iter() {
                    let $case = $F::from_parts(*sign, *exponent, *significand);
                    $inner
                }
            }
        }
    };
}
/// Feeds float fuzzing inputs to `f`: a deterministic edge-case sweep first,
/// then `n` rounds of randomized mutation.
pub fn fuzz_float<F: Float, E: Fn(F)>(n: u32, f: E) {
    // Deterministic sweep over float edge cases.
    float_edge_cases!(F, case, {
        f(case);
    });
    // Randomized rounds; the fixed seed keeps runs reproducible.
    let mut state = Xoshiro128StarStar::seed_from_u64(0);
    let mut value = F::ZERO;
    for _ in 0..n {
        fuzz_float_step(&mut state, &mut value);
        f(value);
    }
}
/// The same as `fuzz_float`, except `f` takes two float inputs.
pub fn fuzz_float_2<F: Float, E: Fn(F, F)>(n: u32, f: E) {
    // Deterministic sweep over all pairs of float edge cases.
    float_edge_cases!(F, case0, {
        float_edge_cases!(F, case1, {
            f(case0, case1);
        });
    });
    // Randomized rounds; the fixed seed keeps runs reproducible.
    let mut state = Xoshiro128StarStar::seed_from_u64(0);
    let mut a = F::ZERO;
    let mut b = F::ZERO;
    for _ in 0..n {
        fuzz_float_step(&mut state, &mut a);
        fuzz_float_step(&mut state, &mut b);
        f(a, b);
    }
}
/// Perform an operation using builtin types if available, falling back to apfloat if not.
#[macro_export]
macro_rules! apfloat_fallback {
    (
        $float_ty:ty,
        // Type name in `rustc_apfloat::ieee`. Not a full path, it automatically gets the prefix.
        $apfloat_ty:ident,
        // Cfg expression for when builtin system operations should be used
        $sys_available:meta,
        // The expression to run. This expression may use `FloatTy` for its signature.
        // Optionally, the final conversion back to a float can be suppressed using
        // `=> no_convert` (for e.g. operations that return a bool).
        //
        // If the apfloat needs a different operation, it can be provided here.
        $op:expr $(=> $convert:ident)? $(; $apfloat_op:expr)?,
        // Arguments that get passed to `$op` after converting to a float
        $($arg:expr),+
        $(,)?
    ) => {{
        // Native path: `FloatTy` is the builtin float and `$op` runs directly.
        #[cfg($sys_available)]
        let ret = {
            type FloatTy = $float_ty;
            $op( $($arg),+ )
        };
        // Fallback path: `FloatTy` becomes the apfloat software type and the
        // args are transported through their bit representation.
        #[cfg(not($sys_available))]
        let ret = {
            use rustc_apfloat::Float;
            type FloatTy = rustc_apfloat::ieee::$apfloat_ty;
            apfloat_fallback!(@inner
                fty: $float_ty,
                // Apply a conversion to `FloatTy` to each arg, then pass all args to `$op`
                op_res: $op( $(FloatTy::from_bits($arg.to_bits().into())),+ ),
                $(apfloat_op: $apfloat_op, )?
                $(conv_opts: $convert,)?
                args: $($arg),+
            )
        };
        ret
    }};
    // Operations that do not need converting back to a float
    (@inner fty: $float_ty:ty, op_res: $val:expr, conv_opts: no_convert, args: $($_arg:expr),+) => {
        $val
    };
    // Some apfloat operations return a `StatusAnd` that we need to extract the value from. This
    // is the default.
    (@inner fty: $float_ty:ty, op_res: $val:expr, args: $($_arg:expr),+) => {{
        // ignore the status, just get the value
        let unwrapped = $val.value;
        <$float_ty>::from_bits(FloatTy::to_bits(unwrapped).try_into().unwrap())
    }};
    // This is the case where we can't use the same expression for the default builtin and
    // nonstandard apfloat fallback (e.g. `as` casts in std are normal functions in apfloat, so
    // two separate expressions must be specified.
    (@inner
        fty: $float_ty:ty, op_res: $_val:expr,
        apfloat_op: $apfloat_op:expr, args: $($arg:expr),+
    ) => {{
        $apfloat_op($($arg),+)
    }};
}

View file

@ -0,0 +1,143 @@
#![allow(unused_macros)]
#![cfg_attr(f128_enabled, feature(f128))]
use builtins_test::*;
mod int_addsub {
    use super::*;
    // Compares the plain builtins (`__rust_*_add`/`__rust_*_sub`) against the
    // standard library's wrapping arithmetic.
    macro_rules! sum {
        ($($i:ty, $fn_add:ident, $fn_sub:ident);*;) => {
            $(
                #[test]
                fn $fn_add() {
                    use compiler_builtins::int::addsub::{$fn_add, $fn_sub};
                    fuzz_2(N, |x: $i, y: $i| {
                        let add0 = x.wrapping_add(y);
                        let sub0 = x.wrapping_sub(y);
                        let add1: $i = $fn_add(x, y);
                        let sub1: $i = $fn_sub(x, y);
                        if add0 != add1 {
                            panic!(
                                "{}({}, {}): std: {}, builtins: {}",
                                stringify!($fn_add), x, y, add0, add1
                            );
                        }
                        if sub0 != sub1 {
                            panic!(
                                "{}({}, {}): std: {}, builtins: {}",
                                stringify!($fn_sub), x, y, sub0, sub1
                            );
                        }
                    });
                }
            )*
        };
    }
    // Compares the overflow-reporting builtins (`__rust_*_addo`/`__rust_*_subo`)
    // against the standard library's `overflowing_*` operations. The builtins
    // report overflow through an `i32` out-parameter (0 or 1), which is why the
    // flags are compared via `i32::from(bool)`.
    macro_rules! overflowing_sum {
        ($($i:ty, $fn_add:ident, $fn_sub:ident);*;) => {
            $(
                #[test]
                fn $fn_add() {
                    use compiler_builtins::int::addsub::{$fn_add, $fn_sub};
                    fuzz_2(N, |x: $i, y: $i| {
                        let (add0, add_o0)= x.overflowing_add(y);
                        let (sub0, sub_o0)= x.overflowing_sub(y);
                        let mut add_o1 = 0;
                        let mut sub_o1 = 0;
                        let add1: $i = $fn_add(x, y, &mut add_o1);
                        let sub1: $i = $fn_sub(x, y, &mut sub_o1);
                        if add0 != add1 || i32::from(add_o0) != add_o1 {
                            panic!(
                                "{}({}, {}): std: {:?}, builtins: {:?}",
                                stringify!($fn_add), x, y, (add0, add_o0) , (add1, add_o1)
                            );
                        }
                        if sub0 != sub1 || i32::from(sub_o0) != sub_o1 {
                            panic!(
                                "{}({}, {}): std: {:?}, builtins: {:?}",
                                stringify!($fn_sub), x, y, (sub0, sub_o0) , (sub1, sub_o1)
                            );
                        }
                    });
                }
            )*
        };
    }
    // Integer addition and subtraction is very simple, so the `N` fuzzing passes
    // run by `fuzz_2` above are plenty.
    sum! {
        u128, __rust_u128_add, __rust_u128_sub;
        i128, __rust_i128_add, __rust_i128_sub;
    }
    overflowing_sum! {
        u128, __rust_u128_addo, __rust_u128_subo;
        i128, __rust_i128_addo, __rust_i128_subo;
    }
}
// Generates a test comparing a pair of float add/sub builtins against the
// reference result, which is either the native operator or an apfloat
// emulation depending on the `$sys_available` cfg. Results are compared
// bitwise via `Float::eq_repr`.
macro_rules! float_sum {
    ($($f:ty, $fn_add:ident, $fn_sub:ident, $apfloat_ty:ident, $sys_available:meta);*;) => {
        $(
            #[test]
            fn $fn_add() {
                use core::ops::{Add, Sub};
                use compiler_builtins::float::{{add::$fn_add, sub::$fn_sub}, Float};
                fuzz_float_2(N, |x: $f, y: $f| {
                    let add0 = apfloat_fallback!($f, $apfloat_ty, $sys_available, Add::add, x, y);
                    let sub0 = apfloat_fallback!($f, $apfloat_ty, $sys_available, Sub::sub, x, y);
                    let add1: $f = $fn_add(x, y);
                    let sub1: $f = $fn_sub(x, y);
                    if !Float::eq_repr(add0, add1) {
                        panic!(
                            "{}({:?}, {:?}): std: {:?}, builtins: {:?}",
                            stringify!($fn_add), x, y, add0, add1
                        );
                    }
                    if !Float::eq_repr(sub0, sub1) {
                        panic!(
                            "{}({:?}, {:?}): std: {:?}, builtins: {:?}",
                            stringify!($fn_sub), x, y, sub0, sub1
                        );
                    }
                });
            }
        )*
    }
}
// NOTE(review): skipped on 32-bit x86 without SSE, presumably because x87
// excess precision makes results differ — confirm.
#[cfg(not(all(target_arch = "x86", not(target_feature = "sse"))))]
mod float_addsub {
    use super::*;
    // `all()` is always true: `f32`/`f64` system references are available everywhere.
    float_sum! {
        f32, __addsf3, __subsf3, Single, all();
        f64, __adddf3, __subdf3, Double, all();
    }
}
#[cfg(f128_enabled)]
#[cfg(not(all(target_arch = "x86", not(target_feature = "sse"))))]
#[cfg(not(any(target_arch = "powerpc", target_arch = "powerpc64")))]
mod float_addsub_f128 {
    use super::*;
    // Compare against system symbols only when they exist, i.e. unless the
    // build script set the `no-sys-f128` feature for this target.
    float_sum! {
        f128, __addtf3, __subtf3, Quad, not(feature = "no-sys-f128");
    }
}
#[cfg(f128_enabled)]
#[cfg(any(target_arch = "powerpc", target_arch = "powerpc64"))]
mod float_addsub_f128_ppc {
    use super::*;
    // PowerPC spells the binary128 intrinsics with `kf` instead of the `tf`
    // used by the sibling module above.
    float_sum! {
        f128, __addkf3, __subkf3, Quad, not(feature = "no-sys-f128");
    }
}

View file

@ -0,0 +1,60 @@
#![cfg(all(
target_arch = "arm",
not(any(target_env = "gnu", target_env = "musl")),
target_os = "linux",
feature = "mem"
))]
#![feature(compiler_builtins_lib)]
#![no_std]
extern crate compiler_builtins;
// test runner
extern crate utest_cortex_m_qemu;
// overrides `panic!`
#[macro_use]
extern crate utest_macros;
use core::mem;
// Route `panic!` through utest's `upanic!` so failures are reported via the
// QEMU-based test harness (`utest_cortex_m_qemu`).
macro_rules! panic {
    ($($tt:tt)*) => {
        upanic!($($tt)*);
    };
}
// AEABI intrinsics under test; implementations are provided by
// `compiler_builtins`. The `4` suffix marks the variants that require
// 4-byte-aligned destinations.
extern "C" {
    fn __aeabi_memclr4(dest: *mut u8, n: usize);
    fn __aeabi_memset4(dest: *mut u8, n: usize, c: u32);
}
/// An 8-byte buffer whose zero-length `u32` member forces 4-byte alignment,
/// as the `__aeabi_*4` intrinsics require.
struct Aligned {
    array: [u8; 8],
    _alignment: [u32; 0],
}
impl Aligned {
    /// Creates a zero-filled, 4-byte-aligned buffer.
    fn new() -> Self {
        Self {
            array: [0u8; 8],
            _alignment: [],
        }
    }
}
// For every length 0..=8: paint the first `n` bytes with 0xff, clear them
// again, and verify the cleared prefix is all zeros.
#[test]
fn memclr4() {
    let mut aligned = Aligned::new();
    assert_eq!(mem::align_of_val(&aligned), 4);
    let xs = &mut aligned.array;
    for n in 0..9 {
        unsafe {
            __aeabi_memset4(xs.as_mut_ptr(), n, 0xff);
            __aeabi_memclr4(xs.as_mut_ptr(), n);
        }
        assert!(xs[0..n].iter().all(|x| *x == 0));
    }
}

View file

@ -0,0 +1,71 @@
#![cfg(all(
target_arch = "arm",
not(any(target_env = "gnu", target_env = "musl")),
target_os = "linux",
feature = "mem"
))]
#![feature(compiler_builtins_lib)]
#![no_std]
extern crate compiler_builtins;
// test runner
extern crate utest_cortex_m_qemu;
// overrides `panic!`
#[macro_use]
extern crate utest_macros;
// Route `panic!` through utest's `upanic!` so failures are reported via the
// QEMU-based test harness (`utest_cortex_m_qemu`).
macro_rules! panic {
    ($($tt:tt)*) => {
        upanic!($($tt)*);
    };
}
// AEABI memcpy intrinsics under test; implementations are provided by
// `compiler_builtins`. The `4` suffix marks the 4-byte-aligned variant.
extern "C" {
    fn __aeabi_memcpy(dest: *mut u8, src: *const u8, n: usize);
    fn __aeabi_memcpy4(dest: *mut u8, src: *const u8, n: usize);
}
/// An 8-byte buffer whose zero-length `u32` member forces 4-byte alignment,
/// as the `__aeabi_*4` intrinsics require.
struct Aligned {
    array: [u8; 8],
    _alignment: [u32; 0],
}
impl Aligned {
    /// Wraps `array` in a 4-byte-aligned container.
    fn new(array: [u8; 8]) -> Self {
        Self {
            array,
            _alignment: [],
        }
    }
}
// Copy prefixes of `src` into a zeroed `dest` and check the copied bytes match.
// NOTE(review): the loop bound `0..dest.len()` never exercises a full-length
// copy, unlike the memclr test's inclusive `0..9` — confirm whether intentional.
#[test]
fn memcpy() {
    let mut dest = [0; 4];
    let src = [0xde, 0xad, 0xbe, 0xef];
    for n in 0..dest.len() {
        dest.copy_from_slice(&[0; 4]);
        unsafe { __aeabi_memcpy(dest.as_mut_ptr(), src.as_ptr(), n) }
        assert_eq!(&dest[0..n], &src[0..n])
    }
}
// Same check through the 4-byte-aligned entry point, using the `Aligned` buffer.
#[test]
fn memcpy4() {
    let mut aligned = Aligned::new([0; 8]);
    let dest = &mut aligned.array;
    let src = [0xde, 0xad, 0xbe, 0xef, 0xba, 0xad, 0xf0, 0x0d];
    for n in 0..dest.len() {
        dest.copy_from_slice(&[0; 8]);
        unsafe { __aeabi_memcpy4(dest.as_mut_ptr(), src.as_ptr(), n) }
        assert_eq!(&dest[0..n], &src[0..n])
    }
}

View file

@ -0,0 +1,240 @@
#![cfg(all(
target_arch = "arm",
not(any(target_env = "gnu", target_env = "musl")),
target_os = "linux",
feature = "mem"
))]
#![feature(compiler_builtins_lib)]
#![no_std]
extern crate compiler_builtins;
// test runner
extern crate utest_cortex_m_qemu;
// overrides `panic!`
#[macro_use]
extern crate utest_macros;
use core::mem;
// Route `panic!` through utest's `upanic!` so failures are reported via the
// QEMU-based test harness (`utest_cortex_m_qemu`).
macro_rules! panic {
    ($($tt:tt)*) => {
        upanic!($($tt)*);
    };
}
// AEABI memset intrinsic under test (4-byte-aligned variant); the
// implementation is provided by `compiler_builtins`.
extern "C" {
    fn __aeabi_memset4(dest: *mut u8, n: usize, c: u32);
}
/// An 8-byte buffer whose zero-length `u32` member forces 4-byte alignment,
/// as the `__aeabi_*4` intrinsics require.
struct Aligned {
    array: [u8; 8],
    _alignment: [u32; 0],
}
impl Aligned {
    /// Wraps `array` in a 4-byte-aligned container.
    fn new(array: [u8; 8]) -> Self {
        Self {
            array,
            _alignment: [],
        }
    }
}
// `n == 0` must leave the destination completely untouched (checked against
// both an all-zero and an all-one buffer).
#[test]
fn zero() {
    let mut aligned = Aligned::new([0u8; 8]);
    assert_eq!(mem::align_of_val(&aligned), 4);
    let xs = &mut aligned.array;
    let c = 0xdeadbeef;
    unsafe { __aeabi_memset4(xs.as_mut_ptr(), 0, c) }
    assert_eq!(*xs, [0; 8]);
    let mut aligned = Aligned::new([1u8; 8]);
    assert_eq!(mem::align_of_val(&aligned), 4);
    let xs = &mut aligned.array;
    let c = 0xdeadbeef;
    unsafe { __aeabi_memset4(xs.as_mut_ptr(), 0, c) }
    assert_eq!(*xs, [1; 8]);
}
// For the nonzero lengths below: exactly the first `n` bytes become 0xef (the
// low byte of `c = 0xdeadbeef`); the remaining bytes keep their prior values.
#[test]
fn one() {
    let mut aligned = Aligned::new([0u8; 8]);
    assert_eq!(mem::align_of_val(&aligned), 4);
    let xs = &mut aligned.array;
    let n = 1;
    let c = 0xdeadbeef;
    unsafe { __aeabi_memset4(xs.as_mut_ptr(), n, c) }
    assert_eq!(*xs, [0xef, 0, 0, 0, 0, 0, 0, 0]);
    let mut aligned = Aligned::new([1u8; 8]);
    assert_eq!(mem::align_of_val(&aligned), 4);
    let xs = &mut aligned.array;
    let c = 0xdeadbeef;
    unsafe { __aeabi_memset4(xs.as_mut_ptr(), n, c) }
    assert_eq!(*xs, [0xef, 1, 1, 1, 1, 1, 1, 1]);
}
#[test]
fn two() {
    let mut aligned = Aligned::new([0u8; 8]);
    assert_eq!(mem::align_of_val(&aligned), 4);
    let xs = &mut aligned.array;
    let n = 2;
    let c = 0xdeadbeef;
    unsafe { __aeabi_memset4(xs.as_mut_ptr(), n, c) }
    assert_eq!(*xs, [0xef, 0xef, 0, 0, 0, 0, 0, 0]);
    let mut aligned = Aligned::new([1u8; 8]);
    assert_eq!(mem::align_of_val(&aligned), 4);
    let xs = &mut aligned.array;
    let c = 0xdeadbeef;
    unsafe { __aeabi_memset4(xs.as_mut_ptr(), n, c) }
    assert_eq!(*xs, [0xef, 0xef, 1, 1, 1, 1, 1, 1]);
}
#[test]
fn three() {
    let mut aligned = Aligned::new([0u8; 8]);
    assert_eq!(mem::align_of_val(&aligned), 4);
    let xs = &mut aligned.array;
    let n = 3;
    let c = 0xdeadbeef;
    unsafe { __aeabi_memset4(xs.as_mut_ptr(), n, c) }
    assert_eq!(*xs, [0xef, 0xef, 0xef, 0, 0, 0, 0, 0]);
    let mut aligned = Aligned::new([1u8; 8]);
    assert_eq!(mem::align_of_val(&aligned), 4);
    let xs = &mut aligned.array;
    let c = 0xdeadbeef;
    unsafe { __aeabi_memset4(xs.as_mut_ptr(), n, c) }
    assert_eq!(*xs, [0xef, 0xef, 0xef, 1, 1, 1, 1, 1]);
}
#[test]
fn four() {
let mut aligned = Aligned::new([0u8; 8]);
assert_eq!(mem::align_of_val(&aligned), 4);
let xs = &mut aligned.array;
let n = 4;
let c = 0xdeadbeef;
unsafe { __aeabi_memset4(xs.as_mut_ptr(), n, c) }
assert_eq!(*xs, [0xef, 0xef, 0xef, 0xef, 0, 0, 0, 0]);
let mut aligned = Aligned::new([1u8; 8]);
assert_eq!(mem::align_of_val(&aligned), 4);
let xs = &mut aligned.array;
let c = 0xdeadbeef;
unsafe { __aeabi_memset4(xs.as_mut_ptr(), n, c) }
assert_eq!(*xs, [0xef, 0xef, 0xef, 0xef, 1, 1, 1, 1]);
}
#[test]
fn five() {
let mut aligned = Aligned::new([0u8; 8]);
assert_eq!(mem::align_of_val(&aligned), 4);
let xs = &mut aligned.array;
let n = 5;
let c = 0xdeadbeef;
unsafe { __aeabi_memset4(xs.as_mut_ptr(), n, c) }
assert_eq!(*xs, [0xef, 0xef, 0xef, 0xef, 0xef, 0, 0, 0]);
let mut aligned = Aligned::new([1u8; 8]);
assert_eq!(mem::align_of_val(&aligned), 4);
let xs = &mut aligned.array;
let c = 0xdeadbeef;
unsafe { __aeabi_memset4(xs.as_mut_ptr(), n, c) }
assert_eq!(*xs, [0xef, 0xef, 0xef, 0xef, 0xef, 1, 1, 1]);
}
#[test]
fn six() {
let mut aligned = Aligned::new([0u8; 8]);
assert_eq!(mem::align_of_val(&aligned), 4);
let xs = &mut aligned.array;
let n = 6;
let c = 0xdeadbeef;
unsafe { __aeabi_memset4(xs.as_mut_ptr(), n, c) }
assert_eq!(*xs, [0xef, 0xef, 0xef, 0xef, 0xef, 0xef, 0, 0]);
let mut aligned = Aligned::new([1u8; 8]);
assert_eq!(mem::align_of_val(&aligned), 4);
let xs = &mut aligned.array;
let c = 0xdeadbeef;
unsafe { __aeabi_memset4(xs.as_mut_ptr(), n, c) }
assert_eq!(*xs, [0xef, 0xef, 0xef, 0xef, 0xef, 0xef, 1, 1]);
}
#[test]
fn seven() {
let mut aligned = Aligned::new([0u8; 8]);
assert_eq!(mem::align_of_val(&aligned), 4);
let xs = &mut aligned.array;
let n = 7;
let c = 0xdeadbeef;
unsafe { __aeabi_memset4(xs.as_mut_ptr(), n, c) }
assert_eq!(*xs, [0xef, 0xef, 0xef, 0xef, 0xef, 0xef, 0xef, 0]);
let mut aligned = Aligned::new([1u8; 8]);
assert_eq!(mem::align_of_val(&aligned), 4);
let xs = &mut aligned.array;
let c = 0xdeadbeef;
unsafe { __aeabi_memset4(xs.as_mut_ptr(), n, c) }
assert_eq!(*xs, [0xef, 0xef, 0xef, 0xef, 0xef, 0xef, 0xef, 1]);
}
#[test]
fn eight() {
let mut aligned = Aligned::new([0u8; 8]);
assert_eq!(mem::align_of_val(&aligned), 4);
let xs = &mut aligned.array;
let n = 8;
let c = 0xdeadbeef;
unsafe { __aeabi_memset4(xs.as_mut_ptr(), n, c) }
assert_eq!(*xs, [0xef, 0xef, 0xef, 0xef, 0xef, 0xef, 0xef, 0xef]);
let mut aligned = Aligned::new([1u8; 8]);
assert_eq!(mem::align_of_val(&aligned), 4);
let xs = &mut aligned.array;
let c = 0xdeadbeef;
unsafe { __aeabi_memset4(xs.as_mut_ptr(), n, c) }
assert_eq!(*xs, [0xef, 0xef, 0xef, 0xef, 0xef, 0xef, 0xef, 0xef]);
}

View file

@ -0,0 +1,134 @@
use compiler_builtins::int::{HInt, MinInt, i256, u256};

// Low half all ones, high half an alternating bit pattern — exercises the
// lo/hi limb split in the widening operations below.
const LOHI_SPLIT: u128 = 0xaaaaaaaaaaaaaaaaffffffffffffffff;

/// Print a `u256` as hex since we can't add format implementations
fn hexu(v: u256) -> String {
    // Limbs are stored little-endian, so print them highest-first.
    let mut out = String::from("0x");
    for limb in v.0.iter().rev() {
        out.push_str(&format!("{:016x}", limb));
    }
    out
}
#[test]
fn widen_u128() {
    // Zero-extension: the low two limbs carry the 128-bit value and the
    // high two limbs are zero (limbs are little-endian).
    assert_eq!(u128::MAX.widen(), u256([u64::MAX, u64::MAX, 0, 0]));
    assert_eq!(
        LOHI_SPLIT.widen(),
        u256([u64::MAX, 0xaaaaaaaaaaaaaaaa, 0, 0])
    );
}
#[test]
fn widen_i128() {
    // Sign-extension: a negative value fills the high limbs with ones.
    assert_eq!((-1i128).widen(), u256::MAX.signed());
    assert_eq!(
        (LOHI_SPLIT as i128).widen(),
        i256([u64::MAX, 0xaaaaaaaaaaaaaaaa, u64::MAX, u64::MAX])
    );
    // `zero_widen` must ignore the sign bit entirely.
    assert_eq!((-1i128).zero_widen().unsigned(), (u128::MAX).widen());
}
#[test]
fn widen_mul_u128() {
    // (lhs, rhs, expected full 256-bit product); limbs little-endian.
    let tests = [
        (u128::MAX / 2, 2_u128, u256([u64::MAX - 1, u64::MAX, 0, 0])),
        (u128::MAX, 2_u128, u256([u64::MAX - 1, u64::MAX, 1, 0])),
        (u128::MAX, u128::MAX, u256([1, 0, u64::MAX - 1, u64::MAX])),
        (u128::MIN, u128::MIN, u256::ZERO),
        (1234, 0, u256::ZERO),
        (0, 1234, u256::ZERO),
    ];
    let mut errors = Vec::new();
    for (i, (a, b, exp)) in tests.iter().copied().enumerate() {
        let res = a.widen_mul(b);
        let res_z = a.zero_widen_mul(b);
        // For unsigned operands the sign-aware and zero-extending
        // multiplications must agree.
        assert_eq!(res, res_z);
        if res != exp {
            errors.push((i, a, b, exp, res));
        }
    }
    // Collect all mismatches before asserting so a single run reports
    // every failing case.
    for (i, a, b, exp, res) in &errors {
        eprintln!(
            "FAILURE ({i}): {a:#034x} * {b:#034x} = {} got {}",
            hexu(*exp),
            hexu(*res)
        );
    }
    assert!(errors.is_empty());
}
#[test]
fn not_u128() {
    // Bitwise NOT of all-zeros must give all-ones.
    assert_eq!(u256::MAX, !u256::ZERO);
}
#[test]
fn shr_u128() {
    // Values that fit entirely in the low 128 bits: shifting the widened
    // value must match shifting narrowly and then widening.
    let only_low = [
        1,
        u16::MAX.into(),
        u32::MAX.into(),
        u64::MAX.into(),
        u128::MAX,
    ];
    let mut errors = Vec::new();
    for a in only_low {
        // Perturb each base value slightly to cover nearby bit patterns.
        for perturb in 0..10 {
            let a = a.saturating_add(perturb);
            for shift in 0..128 {
                let res = a.widen() >> shift;
                let expected = (a >> shift).widen();
                if res != expected {
                    errors.push((a.widen(), shift, res, expected));
                }
            }
        }
    }
    // Hand-picked shifts of an all-ones value around every 64-bit limb
    // boundary (1, 64, 128, 192, 255, ...).
    let check = [
        (
            u256::MAX,
            1,
            u256([u64::MAX, u64::MAX, u64::MAX, u64::MAX >> 1]),
        ),
        (
            u256::MAX,
            5,
            u256([u64::MAX, u64::MAX, u64::MAX, u64::MAX >> 5]),
        ),
        (u256::MAX, 63, u256([u64::MAX, u64::MAX, u64::MAX, 1])),
        (u256::MAX, 64, u256([u64::MAX, u64::MAX, u64::MAX, 0])),
        (u256::MAX, 65, u256([u64::MAX, u64::MAX, u64::MAX >> 1, 0])),
        (u256::MAX, 127, u256([u64::MAX, u64::MAX, 1, 0])),
        (u256::MAX, 128, u256([u64::MAX, u64::MAX, 0, 0])),
        (u256::MAX, 129, u256([u64::MAX, u64::MAX >> 1, 0, 0])),
        (u256::MAX, 191, u256([u64::MAX, 1, 0, 0])),
        (u256::MAX, 192, u256([u64::MAX, 0, 0, 0])),
        (u256::MAX, 193, u256([u64::MAX >> 1, 0, 0, 0])),
        // NOTE(review): duplicate of the `191` case above — harmless, but
        // could be removed.
        (u256::MAX, 191, u256([u64::MAX, 1, 0, 0])),
        (u256::MAX, 254, u256([0b11, 0, 0, 0])),
        (u256::MAX, 255, u256([1, 0, 0, 0])),
    ];
    for (input, shift, expected) in check {
        let res = input >> shift;
        if res != expected {
            errors.push((input, shift, res, expected));
        }
    }
    // Report all failures at once, then assert.
    for (a, b, res, expected) in &errors {
        eprintln!(
            "FAILURE: {} >> {b} = {} got {}",
            hexu(*a),
            hexu(*expected),
            hexu(*res),
        );
    }
    assert!(errors.is_empty());
}

View file

@ -0,0 +1,184 @@
#![allow(unused_macros)]
#![allow(unreachable_code)]
#![cfg_attr(f128_enabled, feature(f128))]
use builtins_test::*;
mod float_comparisons {
    use super::*;

    /// Compute a reference three-way comparison of `$x` and `$y` and check
    /// each builtin comparison `$fn` against it.
    ///
    /// For every `($unordered_val, $fn)` pair, `$unordered_val` is the
    /// value the builtin must return when the operands are unordered
    /// (either one is NaN). `$sys_available` says whether the host has
    /// native support for `$f`; when it does not, `apfloat_fallback!`
    /// routes the reference computation through `rustc_apfloat`'s
    /// `$apfloat_ty`.
    macro_rules! cmp {
        (
            $f:ty, $x:ident, $y:ident, $apfloat_ty:ident, $sys_available:meta,
            $($unordered_val:expr, $fn:ident);*;
        ) => {
            $(
                // Reference result: $unordered_val for NaNs, else -1/0/1.
                let cmp0 = if apfloat_fallback!(
                    $f, $apfloat_ty, $sys_available,
                    |x: FloatTy| x.is_nan() => no_convert,
                    $x
                ) || apfloat_fallback!(
                    $f, $apfloat_ty, $sys_available,
                    |y: FloatTy| y.is_nan() => no_convert,
                    $y
                )
                {
                    $unordered_val
                } else if apfloat_fallback!(
                    $f, $apfloat_ty, $sys_available,
                    |x, y| x < y => no_convert,
                    $x, $y
                ) {
                    -1
                } else if apfloat_fallback!(
                    $f, $apfloat_ty, $sys_available,
                    |x, y| x == y => no_convert,
                    $x, $y
                ) {
                    0
                } else {
                    1
                };
                let cmp1 = $fn($x, $y);
                if cmp0 != cmp1 {
                    panic!(
                        "{}({:?}, {:?}): std: {:?}, builtins: {:?}",
                        stringify!($fn), $x, $y, cmp0, cmp1
                    );
                }
            )*
        };
    }

    #[test]
    fn cmp_f32() {
        use compiler_builtins::float::cmp::{
            __eqsf2, __gesf2, __gtsf2, __lesf2, __ltsf2, __nesf2, __unordsf2,
        };
        fuzz_float_2(N, |x: f32, y: f32| {
            // `__unordsf2` returns nonzero iff either operand is NaN.
            assert_eq!(__unordsf2(x, y) != 0, x.is_nan() || y.is_nan());
            cmp!(f32, x, y, Single, all(),
                1, __ltsf2;
                1, __lesf2;
                1, __eqsf2;
                -1, __gesf2;
                -1, __gtsf2;
                1, __nesf2;
            );
        });
    }

    #[test]
    fn cmp_f64() {
        use compiler_builtins::float::cmp::{
            __eqdf2, __gedf2, __gtdf2, __ledf2, __ltdf2, __nedf2, __unorddf2,
        };
        fuzz_float_2(N, |x: f64, y: f64| {
            // `__unorddf2` returns nonzero iff either operand is NaN.
            assert_eq!(__unorddf2(x, y) != 0, x.is_nan() || y.is_nan());
            cmp!(f64, x, y, Double, all(),
                1, __ltdf2;
                1, __ledf2;
                1, __eqdf2;
                -1, __gedf2;
                -1, __gtdf2;
                1, __nedf2;
            );
        });
    }

    #[test]
    #[cfg(f128_enabled)]
    fn cmp_f128() {
        // PowerPC uses `kf`-suffixed symbol names for IEEE binary128, so
        // alias them to the generic `tf` names used below.
        #[cfg(any(target_arch = "powerpc", target_arch = "powerpc64"))]
        use compiler_builtins::float::cmp::{
            __eqkf2 as __eqtf2, __gekf2 as __getf2, __gtkf2 as __gttf2, __lekf2 as __letf2,
            __ltkf2 as __lttf2, __nekf2 as __netf2, __unordkf2 as __unordtf2,
        };
        #[cfg(not(any(target_arch = "powerpc", target_arch = "powerpc64")))]
        use compiler_builtins::float::cmp::{
            __eqtf2, __getf2, __gttf2, __letf2, __lttf2, __netf2, __unordtf2,
        };
        fuzz_float_2(N, |x: f128, y: f128| {
            // `f128::is_nan` may not exist natively; go through the
            // apfloat fallback for the NaN checks as well.
            let x_is_nan = apfloat_fallback!(
                f128, Quad, not(feature = "no-sys-f128"),
                |x: FloatTy| x.is_nan() => no_convert,
                x
            );
            let y_is_nan = apfloat_fallback!(
                f128, Quad, not(feature = "no-sys-f128"),
                |x: FloatTy| x.is_nan() => no_convert,
                y
            );
            assert_eq!(__unordtf2(x, y) != 0, x_is_nan || y_is_nan);
            cmp!(f128, x, y, Quad, not(feature = "no-sys-f128"),
                1, __lttf2;
                1, __letf2;
                1, __eqtf2;
                -1, __getf2;
                -1, __gttf2;
                1, __netf2;
            );
        });
    }
}
#[cfg(target_arch = "arm")]
mod float_comparisons_arm {
    use super::*;

    /// Check the AEABI comparison wrappers, which return a boolean-style
    /// `i32` rather than a three-way ordering. `$fn_std` is the native
    /// comparison used as reference and `$unordered_val` is the expected
    /// result when either input is NaN.
    macro_rules! cmp2 {
        ($x:ident, $y:ident, $($unordered_val:expr, $fn_std:expr, $fn_builtins:ident);*;) => {
            $(
                let cmp0: i32 = if $x.is_nan() || $y.is_nan() {
                    $unordered_val
                } else {
                    $fn_std as i32
                };
                let cmp1: i32 = $fn_builtins($x, $y);
                if cmp0 != cmp1 {
                    panic!("{}({}, {}): std: {}, builtins: {}", stringify!($fn_builtins), $x, $y, cmp0, cmp1);
                }
            )*
        };
    }

    #[test]
    fn cmp_f32() {
        use compiler_builtins::float::cmp::{
            __aeabi_fcmpeq, __aeabi_fcmpge, __aeabi_fcmpgt, __aeabi_fcmple, __aeabi_fcmplt,
        };
        fuzz_float_2(N, |x: f32, y: f32| {
            cmp2!(x, y,
                0, x < y, __aeabi_fcmplt;
                0, x <= y, __aeabi_fcmple;
                0, x == y, __aeabi_fcmpeq;
                0, x >= y, __aeabi_fcmpge;
                0, x > y, __aeabi_fcmpgt;
            );
        });
    }

    #[test]
    fn cmp_f64() {
        use compiler_builtins::float::cmp::{
            __aeabi_dcmpeq, __aeabi_dcmpge, __aeabi_dcmpgt, __aeabi_dcmple, __aeabi_dcmplt,
        };
        fuzz_float_2(N, |x: f64, y: f64| {
            cmp2!(x, y,
                0, x < y, __aeabi_dcmplt;
                0, x <= y, __aeabi_dcmple;
                0, x == y, __aeabi_dcmpeq;
                0, x >= y, __aeabi_dcmpge;
                0, x > y, __aeabi_dcmpgt;
            );
        });
    }
}

View file

@ -0,0 +1,364 @@
#![cfg_attr(f128_enabled, feature(f128))]
#![cfg_attr(f16_enabled, feature(f16))]
// makes configuration easier
#![allow(unused_macros)]
#![allow(unused_imports)]
use builtins_test::*;
use compiler_builtins::float::Float;
use rustc_apfloat::{Float as _, FloatConvert as _};
mod i_to_f {
    use super::*;

    /// Generate one test per `($i_ty, $fn)` pair that converts an integer
    /// to `$f_ty` via the builtin and checks the result two ways:
    /// 1. a round-trip rounding check (the chosen float must be at least
    ///    as close to the integer as both of its neighbors), and
    /// 2. a direct comparison against the native `as` cast (or a
    ///    `rustc_apfloat` reference when `$sys_available` is false).
    macro_rules! i_to_f {
        ($f_ty:ty, $apfloat_ty:ident, $sys_available:meta, $($i_ty:ty, $fn:ident);*;) => {
            $(
                #[test]
                fn $fn() {
                    use compiler_builtins::float::conv::$fn;
                    use compiler_builtins::int::Int;
                    fuzz(N, |x: $i_ty| {
                        let f0 = apfloat_fallback!(
                            $f_ty, $apfloat_ty, $sys_available,
                            |x| x as $f_ty;
                            // When the builtin is not available, we need to use a different conversion
                            // method (since apfloat doesn't support `as` casting).
                            |x: $i_ty| {
                                use compiler_builtins::int::MinInt;
                                let apf = if <$i_ty>::SIGNED {
                                    FloatTy::from_i128(x.try_into().unwrap()).value
                                } else {
                                    FloatTy::from_u128(x.try_into().unwrap()).value
                                };
                                <$f_ty>::from_bits(apf.to_bits())
                            },
                            x
                        );
                        let f1: $f_ty = $fn(x);
                        #[cfg($sys_available)] {
                            // This makes sure that the conversion produced the best rounding possible, and does
                            // this independent of `x as $into` rounding correctly.
                            // This assumes that float to integer conversion is correct.
                            let y_minus_ulp = <$f_ty>::from_bits(f1.to_bits().wrapping_sub(1)) as $i_ty;
                            let y = f1 as $i_ty;
                            let y_plus_ulp = <$f_ty>::from_bits(f1.to_bits().wrapping_add(1)) as $i_ty;
                            let error_minus = <$i_ty as Int>::abs_diff(y_minus_ulp, x);
                            let error = <$i_ty as Int>::abs_diff(y, x);
                            let error_plus = <$i_ty as Int>::abs_diff(y_plus_ulp, x);
                            // The first two conditions check that none of the two closest float values are
                            // strictly closer in representation to `x`. The second makes sure that rounding is
                            // towards even significand if two float values are equally close to the integer.
                            if error_minus < error
                                || error_plus < error
                                || ((error_minus == error || error_plus == error)
                                    && ((f0.to_bits() & 1) != 0))
                            {
                                if !cfg!(any(
                                    target_arch = "powerpc",
                                    target_arch = "powerpc64"
                                )) {
                                    panic!(
                                        "incorrect rounding by {}({}): {}, ({}, {}, {}), errors ({}, {}, {})",
                                        stringify!($fn),
                                        x,
                                        f1.to_bits(),
                                        y_minus_ulp,
                                        y,
                                        y_plus_ulp,
                                        error_minus,
                                        error,
                                        error_plus,
                                    );
                                }
                            }
                        }
                        // Test against native conversion. We disable testing on all `x86` because of
                        // rounding bugs with `i686`. `powerpc` also has the same rounding bug.
                        if !Float::eq_repr(f0, f1) && !cfg!(any(
                            target_arch = "x86",
                            target_arch = "powerpc",
                            target_arch = "powerpc64"
                        )) {
                            panic!(
                                "{}({}): std: {:?}, builtins: {:?}",
                                stringify!($fn),
                                x,
                                f0,
                                f1,
                            );
                        }
                    });
                }
            )*
        };
    }

    i_to_f! { f32, Single, all(),
        u32, __floatunsisf;
        i32, __floatsisf;
        u64, __floatundisf;
        i64, __floatdisf;
        u128, __floatuntisf;
        i128, __floattisf;
    }
    i_to_f! { f64, Double, all(),
        u32, __floatunsidf;
        i32, __floatsidf;
        u64, __floatundidf;
        i64, __floatdidf;
        u128, __floatuntidf;
        i128, __floattidf;
    }
    #[cfg(not(feature = "no-f16-f128"))]
    #[cfg(not(any(target_arch = "powerpc", target_arch = "powerpc64")))]
    i_to_f! { f128, Quad, not(feature = "no-sys-f128-int-convert"),
        u32, __floatunsitf;
        i32, __floatsitf;
        u64, __floatunditf;
        i64, __floatditf;
        u128, __floatuntitf;
        i128, __floattitf;
    }
    // PowerPC uses `kf`-suffixed names for IEEE binary128.
    #[cfg(not(feature = "no-f16-f128"))]
    #[cfg(any(target_arch = "powerpc", target_arch = "powerpc64"))]
    i_to_f! { f128, Quad, not(feature = "no-sys-f128-int-convert"),
        u32, __floatunsikf;
        i32, __floatsikf;
        u64, __floatundikf;
        i64, __floatdikf;
        u128, __floatuntikf;
        i128, __floattikf;
    }
}
mod f_to_i {
    use super::*;

    /// For each `($i_ty, $fn)` pair, convert `$x` to the integer type via
    /// the builtin and compare against a reference conversion (native `as`
    /// cast, or `rustc_apfloat` when `$sys_available` is false). NaN
    /// inputs are skipped since the conversion is UB for them.
    macro_rules! f_to_i {
        ($x:ident, $f_ty:ty, $apfloat_ty:ident, $sys_available:meta, $($i_ty:ty, $fn:ident);*;) => {
            $(
                // it is undefined behavior in the first place to do conversions with NaNs
                if !apfloat_fallback!(
                    $f_ty, $apfloat_ty, $sys_available, |x: FloatTy| x.is_nan() => no_convert, $x
                ) {
                    let conv0 = apfloat_fallback!(
                        $f_ty, $apfloat_ty, $sys_available,
                        // Use an `as` cast when the builtin is available on the system.
                        |x| x as $i_ty;
                        // When the builtin is not available, we need to use a different conversion
                        // method (since apfloat doesn't support `as` casting).
                        |x: $f_ty| {
                            use compiler_builtins::int::MinInt;
                            let apf = FloatTy::from_bits(x.to_bits().into());
                            let bits: usize = <$i_ty>::BITS.try_into().unwrap();
                            let err_fn = || panic!(
                                "Unable to convert value {x:?} to type {}:", stringify!($i_ty)
                            );
                            if <$i_ty>::SIGNED {
                                <$i_ty>::try_from(apf.to_i128(bits).value).ok().unwrap_or_else(err_fn)
                            } else {
                                <$i_ty>::try_from(apf.to_u128(bits).value).ok().unwrap_or_else(err_fn)
                            }
                        },
                        $x
                    );
                    let conv1: $i_ty = $fn($x);
                    if conv0 != conv1 {
                        panic!("{}({:?}): std: {:?}, builtins: {:?}", stringify!($fn), $x, conv0, conv1);
                    }
                }
            )*
        };
    }

    #[test]
    fn f32_to_int() {
        use compiler_builtins::float::conv::{
            __fixsfdi, __fixsfsi, __fixsfti, __fixunssfdi, __fixunssfsi, __fixunssfti,
        };
        fuzz_float(N, |x: f32| {
            f_to_i!(x, f32, Single, all(),
                u32, __fixunssfsi;
                u64, __fixunssfdi;
                u128, __fixunssfti;
                i32, __fixsfsi;
                i64, __fixsfdi;
                i128, __fixsfti;
            );
        });
    }

    #[test]
    fn f64_to_int() {
        use compiler_builtins::float::conv::{
            __fixdfdi, __fixdfsi, __fixdfti, __fixunsdfdi, __fixunsdfsi, __fixunsdfti,
        };
        fuzz_float(N, |x: f64| {
            f_to_i!(x, f64, Double, all(),
                u32, __fixunsdfsi;
                u64, __fixunsdfdi;
                u128, __fixunsdfti;
                i32, __fixdfsi;
                i64, __fixdfdi;
                i128, __fixdfti;
            );
        });
    }

    #[test]
    #[cfg(f128_enabled)]
    fn f128_to_int() {
        // PowerPC uses `kf`-suffixed symbol names for IEEE binary128.
        #[cfg(any(target_arch = "powerpc", target_arch = "powerpc64"))]
        use compiler_builtins::float::conv::{
            __fixkfdi as __fixtfdi, __fixkfsi as __fixtfsi, __fixkfti as __fixtfti,
            __fixunskfdi as __fixunstfdi, __fixunskfsi as __fixunstfsi,
            __fixunskfti as __fixunstfti,
        };
        #[cfg(not(any(target_arch = "powerpc", target_arch = "powerpc64")))]
        use compiler_builtins::float::conv::{
            __fixtfdi, __fixtfsi, __fixtfti, __fixunstfdi, __fixunstfsi, __fixunstfti,
        };
        fuzz_float(N, |x: f128| {
            f_to_i!(
                x,
                f128,
                Quad,
                not(feature = "no-sys-f128-int-convert"),
                u32, __fixunstfsi;
                u64, __fixunstfdi;
                u128, __fixunstfti;
                i32, __fixtfsi;
                i64, __fixtfdi;
                i128, __fixtfti;
            );
        });
    }
}
/// Generate a test per float-to-float conversion builtin `$fn` living in
/// module `$mod` (`extend` or `trunc`). The reference result is the native
/// `as` cast when `$sys_available` holds, otherwise a `rustc_apfloat`
/// conversion between `$from_ap_ty` and `$to_ap_ty`.
macro_rules! f_to_f {
    (
        $mod:ident,
        $(
            $from_ty:ty => $to_ty:ty,
            $from_ap_ty:ident => $to_ap_ty:ident,
            $fn:ident, $sys_available:meta
        );+;
    ) => {$(
        #[test]
        fn $fn() {
            use compiler_builtins::float::{$mod::$fn, Float};
            use rustc_apfloat::ieee::{$from_ap_ty, $to_ap_ty};
            fuzz_float(N, |x: $from_ty| {
                let tmp0: $to_ty = apfloat_fallback!(
                    $from_ty,
                    $from_ap_ty,
                    $sys_available,
                    |x: $from_ty| x as $to_ty;
                    |x: $from_ty| {
                        let from_apf = FloatTy::from_bits(x.to_bits().into());
                        // Get `value` directly to ignore INVALID_OP
                        let to_apf: $to_ap_ty = from_apf.convert(&mut false).value;
                        <$to_ty>::from_bits(to_apf.to_bits().try_into().unwrap())
                    },
                    x
                );
                let tmp1: $to_ty = $fn(x);
                // Compare bit representations so NaNs and signed zeros
                // are checked exactly.
                if !Float::eq_repr(tmp0, tmp1) {
                    panic!(
                        "{}({:?}): std: {:?}, builtins: {:?}",
                        stringify!($fn),
                        x,
                        tmp0,
                        tmp1
                    );
                }
            })
        }
    )+};
}
// Widening conversions (smaller float -> larger float).
mod extend {
    use super::*;

    f_to_f! {
        extend,
        f32 => f64, Single => Double, __extendsfdf2, all();
    }

    #[cfg(all(f16_enabled, f128_enabled))]
    #[cfg(not(any(
        target_arch = "powerpc",
        target_arch = "powerpc64",
        target_arch = "loongarch64"
    )))]
    f_to_f! {
        extend,
        f16 => f32, Half => Single, __extendhfsf2, not(feature = "no-sys-f16");
        f16 => f32, Half => Single, __gnu_h2f_ieee, not(feature = "no-sys-f16");
        f16 => f64, Half => Double, __extendhfdf2, not(feature = "no-sys-f16-f64-convert");
        f16 => f128, Half => Quad, __extendhftf2, not(feature = "no-sys-f16-f128-convert");
        f32 => f128, Single => Quad, __extendsftf2, not(feature = "no-sys-f128");
        f64 => f128, Double => Quad, __extenddftf2, not(feature = "no-sys-f128");
    }

    // PowerPC uses `kf`-suffixed names for IEEE binary128.
    #[cfg(f128_enabled)]
    #[cfg(any(target_arch = "powerpc", target_arch = "powerpc64"))]
    f_to_f! {
        extend,
        // FIXME(#655): `f16` tests disabled until we can bootstrap symbols
        f32 => f128, Single => Quad, __extendsfkf2, not(feature = "no-sys-f128");
        f64 => f128, Double => Quad, __extenddfkf2, not(feature = "no-sys-f128");
    }
}
// Narrowing conversions (larger float -> smaller float).
mod trunc {
    use super::*;

    f_to_f! {
        trunc,
        f64 => f32, Double => Single, __truncdfsf2, all();
    }

    #[cfg(all(f16_enabled, f128_enabled))]
    #[cfg(not(any(
        target_arch = "powerpc",
        target_arch = "powerpc64",
        target_arch = "loongarch64"
    )))]
    f_to_f! {
        trunc,
        f32 => f16, Single => Half, __truncsfhf2, not(feature = "no-sys-f16");
        f32 => f16, Single => Half, __gnu_f2h_ieee, not(feature = "no-sys-f16");
        f64 => f16, Double => Half, __truncdfhf2, not(feature = "no-sys-f16-f64-convert");
        f128 => f16, Quad => Half, __trunctfhf2, not(feature = "no-sys-f16-f128-convert");
        f128 => f32, Quad => Single, __trunctfsf2, not(feature = "no-sys-f128");
        f128 => f64, Quad => Double, __trunctfdf2, not(feature = "no-sys-f128");
    }

    // PowerPC uses `kf`-suffixed names for IEEE binary128.
    #[cfg(f128_enabled)]
    #[cfg(any(target_arch = "powerpc", target_arch = "powerpc64"))]
    f_to_f! {
        trunc,
        // FIXME(#655): `f16` tests disabled until we can bootstrap symbols
        f128 => f32, Quad => Single, __trunckfsf2, not(feature = "no-sys-f128");
        f128 => f64, Quad => Double, __trunckfdf2, not(feature = "no-sys-f128");
    }
}

View file

@ -0,0 +1,164 @@
#![feature(f128)]
#![allow(unused_macros)]
use builtins_test::*;
use compiler_builtins::int::sdiv::{__divmoddi4, __divmodsi4, __divmodti4};
use compiler_builtins::int::udiv::{__udivmoddi4, __udivmodsi4, __udivmodti4, u128_divide_sparc};
// Division algorithms have by far the nastiest and largest number of edge cases, and experience shows
// that sometimes 100_000 iterations of the random fuzzer is needed.
/// Creates intensive test functions for division functions of a certain size
macro_rules! test {
    (
        $n:expr, // the number of bits in a $iX or $uX
        $uX:ident, // unsigned integer that will be shifted
        $iX:ident, // signed version of $uX
        $test_name:ident, // name of the test function
        $unsigned_name:ident, // unsigned division function
        $signed_name:ident // signed division function
    ) => {
        #[test]
        fn $test_name() {
            fuzz_2(N, |lhs, rhs| {
                // Division by zero is not checked here.
                if rhs == 0 {
                    return;
                }
                let mut rem: $uX = 0;
                let quo: $uX = $unsigned_name(lhs, rhs, Some(&mut rem));
                // Validate via the Euclidean identity and `rem < rhs`
                // rather than comparing against `/` and `%` directly.
                if rhs <= rem || (lhs != rhs.wrapping_mul(quo).wrapping_add(rem)) {
                    panic!(
                        "unsigned division function failed with lhs:{} rhs:{} \
                        std:({}, {}) builtins:({}, {})",
                        lhs,
                        rhs,
                        lhs.wrapping_div(rhs),
                        lhs.wrapping_rem(rhs),
                        quo,
                        rem
                    );
                }
                // test the signed division function also
                let lhs = lhs as $iX;
                let rhs = rhs as $iX;
                let mut rem: $iX = 0;
                let quo: $iX = $signed_name(lhs, rhs, &mut rem);
                // We cannot just test that
                // `lhs == rhs.wrapping_mul(quo).wrapping_add(rem)`, but also
                // need to make sure the remainder isn't larger than the divisor
                // and has the correct sign.
                let incorrect_rem = if rem == 0 {
                    false
                } else if rhs == $iX::MIN {
                    // `rhs.wrapping_abs()` would overflow, so handle this case
                    // separately.
                    (lhs.is_negative() != rem.is_negative()) || (rem == $iX::MIN)
                } else {
                    (lhs.is_negative() != rem.is_negative())
                        || (rhs.wrapping_abs() <= rem.wrapping_abs())
                };
                if incorrect_rem || lhs != rhs.wrapping_mul(quo).wrapping_add(rem) {
                    panic!(
                        "signed division function failed with lhs:{} rhs:{} \
                        std:({}, {}) builtins:({}, {})",
                        lhs,
                        rhs,
                        lhs.wrapping_div(rhs),
                        lhs.wrapping_rem(rhs),
                        quo,
                        rem
                    );
                }
            });
        }
    };
}

test!(32, u32, i32, div_rem_si4, __udivmodsi4, __divmodsi4);
test!(64, u64, i64, div_rem_di4, __udivmoddi4, __divmoddi4);
test!(128, u128, i128, div_rem_ti4, __udivmodti4, __divmodti4);
#[test]
fn divide_sparc() {
    fuzz_2(N, |numer, denom| {
        // Division by zero is not checked here.
        if denom == 0 {
            return;
        }
        let mut remainder: u128 = 0;
        let quotient: u128 = u128_divide_sparc(numer, denom, &mut remainder);
        // The remainder must be strictly smaller than the divisor and the
        // Euclidean identity must hold (with wrapping arithmetic).
        let inconsistent =
            remainder >= denom || numer != denom.wrapping_mul(quotient).wrapping_add(remainder);
        if inconsistent {
            panic!(
                "u128_divide_sparc({}, {}): \
                std:({}, {}), builtins:({}, {})",
                numer,
                denom,
                numer.wrapping_div(denom),
                numer.wrapping_rem(denom),
                quotient,
                remainder
            );
        }
    });
}
/// Generate a test for each float division builtin `$fn`, comparing its
/// result against the host `Div` implementation (or the apfloat fallback
/// when `$sys_available` is false).
macro_rules! float {
    ($($f:ty, $fn:ident, $apfloat_ty:ident, $sys_available:meta);*;) => {
        $(
            #[test]
            fn $fn() {
                use compiler_builtins::float::{div::$fn, Float};
                use core::ops::Div;
                fuzz_float_2(N, |x: $f, y: $f| {
                    let quo0: $f = apfloat_fallback!($f, $apfloat_ty, $sys_available, Div::div, x, y);
                    let quo1: $f = $fn(x, y);
                    // ARM SIMD instructions always flush subnormals to zero
                    if cfg!(target_arch = "arm") &&
                        ((Float::is_subnormal(quo0)) || Float::is_subnormal(quo1)) {
                        return;
                    }
                    // Compare bit representations so NaNs and signed
                    // zeros are checked exactly.
                    if !Float::eq_repr(quo0, quo1) {
                        panic!(
                            "{}({:?}, {:?}): std: {:?}, builtins: {:?}",
                            stringify!($fn),
                            x,
                            y,
                            quo0,
                            quo1
                        );
                    }
                });
            }
        )*
    };
}

// Skipped on x87-only x86, where excess precision breaks comparisons.
#[cfg(not(all(target_arch = "x86", not(target_feature = "sse"))))]
mod float_div {
    use super::*;

    float! {
        f32, __divsf3, Single, all();
        f64, __divdf3, Double, all();
    }

    #[cfg(not(feature = "no-f16-f128"))]
    #[cfg(not(any(target_arch = "powerpc", target_arch = "powerpc64")))]
    float! {
        f128, __divtf3, Quad,
        // FIXME(llvm): there is a bug in LLVM rt.
        // See <https://github.com/llvm/llvm-project/issues/91840>.
        not(any(feature = "no-sys-f128", all(target_arch = "aarch64", target_os = "linux")));
    }

    // PowerPC uses the `kf` symbol name for IEEE binary128 division.
    #[cfg(not(feature = "no-f16-f128"))]
    #[cfg(any(target_arch = "powerpc", target_arch = "powerpc64"))]
    float! {
        f128, __divkf3, Quad, not(feature = "no-sys-f128");
    }
}

View file

@ -0,0 +1,72 @@
#![allow(unused_macros)]
#![cfg_attr(f128_enabled, feature(f128))]
#![cfg(not(all(target_arch = "x86", not(target_feature = "sse"))))]
use builtins_test::*;
// This is approximate because of issues related to
// https://github.com/rust-lang/rust/issues/73920.
// TODO how do we resolve this indeterminacy?
/// Generate a test for each `powi`-style builtin `$fn`, comparing against
/// `std::powi` within a relative tolerance `$tolerance` (the result is
/// inherently approximate, see the issue referenced above).
macro_rules! pow {
    ($($f:ty, $tolerance:expr, $fn:ident, $sys_available:meta);*;) => {
        $(
            #[test]
            // FIXME(apfloat): We skip tests if system symbols aren't available rather
            // than providing a fallback, since `rustc_apfloat` does not provide `pow`.
            #[cfg($sys_available)]
            fn $fn() {
                use compiler_builtins::float::pow::$fn;
                use compiler_builtins::float::Float;
                fuzz_float_2(N, |x: $f, y: $f| {
                    if !(Float::is_subnormal(x) || Float::is_subnormal(y) || x.is_nan()) {
                        // Derive an integer exponent from `y`'s exponent bits.
                        let n = y.to_bits() & !<$f as Float>::SIG_MASK;
                        let n = (n as <$f as Float>::SignedInt) >> <$f as Float>::SIG_BITS;
                        let n = n as i32;
                        let tmp0: $f = x.powi(n);
                        let tmp1: $f = $fn(x, n);
                        // Order the two results so the ratio below is >= 1.
                        let (a, b) = if tmp0 < tmp1 {
                            (tmp0, tmp1)
                        } else {
                            (tmp1, tmp0)
                        };
                        // Accept exact equality (covers infinities), both
                        // tiny, or a ratio within the tolerance band.
                        let good = if a == b {
                            // handles infinity equality
                            true
                        } else if a < $tolerance {
                            b < $tolerance
                        } else {
                            let quo = b / a;
                            (quo < (1. + $tolerance)) && (quo > (1. - $tolerance))
                        };
                        assert!(
                            good,
                            "{}({:?}, {:?}): std: {:?}, builtins: {:?}",
                            stringify!($fn), x, n, tmp0, tmp1
                        );
                    }
                });
            }
        )*
    };
}

pow! {
    f32, 1e-4, __powisf2, all();
    f64, 1e-12, __powidf2, all();
}

#[cfg(f128_enabled)]
// FIXME(f16_f128): MSVC cannot build these until `__divtf3` is available in nightly.
#[cfg(not(target_env = "msvc"))]
#[cfg(not(any(target_arch = "powerpc", target_arch = "powerpc64")))]
pow! {
    f128, 1e-36, __powitf2, not(feature = "no-sys-f128");
}

// PowerPC uses the `kf` symbol name for IEEE binary128.
#[cfg(f128_enabled)]
#[cfg(any(target_arch = "powerpc", target_arch = "powerpc64"))]
pow! {
    f128, 1e-36, __powikf2, not(feature = "no-sys-f128");
}

View file

@ -0,0 +1,97 @@
#![feature(decl_macro)] // so we can use pub(super)
#![cfg(all(target_arch = "aarch64", target_os = "linux", not(feature = "no-asm")))]
/// Translate a byte size to a Rust type.
macro int_ty {
(1) => { i8 },
(2) => { i16 },
(4) => { i32 },
(8) => { i64 },
(16) => { i128 }
}
mod cas {
    /// Generate a test for one compare-and-swap outline-atomics helper.
    ///
    /// Checks the failure path (current value differs from `expected`, so
    /// the target must be left untouched) and the success path (target is
    /// replaced by `new`); in both cases the return value must be the
    /// previous value stored at the target.
    pub(super) macro test($_ordering:ident, $bytes:tt, $name:ident) {
        #[test]
        fn $name() {
            builtins_test::fuzz_2(10000, |expected: super::int_ty!($bytes), new| {
                // `target` deliberately differs from `expected` so the CAS fails.
                let mut target = expected.wrapping_add(10);
                assert_eq!(
                    unsafe {
                        compiler_builtins::aarch64_linux::$name::$name(expected, new, &mut target)
                    },
                    expected.wrapping_add(10),
                    "return value should always be the previous value",
                );
                assert_eq!(
                    target,
                    expected.wrapping_add(10),
                    "shouldn't have changed target"
                );
                // Now make the CAS succeed.
                target = expected;
                assert_eq!(
                    unsafe {
                        compiler_builtins::aarch64_linux::$name::$name(expected, new, &mut target)
                    },
                    expected
                );
                assert_eq!(target, new, "should have updated target");
            });
        }
    }
}

/// `foreach_cas16!` passes no byte-size argument, so adapt the plain CAS
/// test macro with a fixed 16-byte width.
macro test_cas16($_ordering:ident, $name:ident) {
    cas::test!($_ordering, 16, $name);
}
mod swap {
    /// Generate a test for one atomic-swap outline-atomics helper: the
    /// helper must return the previous target value and store `left`.
    pub(super) macro test($_ordering:ident, $bytes:tt, $name:ident) {
        #[test]
        fn $name() {
            builtins_test::fuzz_2(10000, |left: super::int_ty!($bytes), mut right| {
                let orig_right = right;
                assert_eq!(
                    unsafe { compiler_builtins::aarch64_linux::$name::$name(left, &mut right) },
                    orig_right
                );
                assert_eq!(left, right);
            });
        }
    }
}
/// Build a module `$mod` containing a test-generation macro for one
/// read-modify-write atomic family; `$op` is the reference binary
/// operation the helper is expected to apply to the stored value.
macro_rules! test_op {
    ($mod:ident, $( $op:tt )* ) => {
        mod $mod {
            pub(super) macro test {
                ($_ordering:ident, $bytes:tt, $name:ident) => {
                    #[test]
                    fn $name() {
                        builtins_test::fuzz_2(10000, |old, val| {
                            let mut target = old;
                            let op: fn(super::int_ty!($bytes), super::int_ty!($bytes)) -> _ = $($op)*;
                            let expected = op(old, val);
                            assert_eq!(old, unsafe { compiler_builtins::aarch64_linux::$name::$name(val, &mut target) }, "{} should return original value", stringify!($name));
                            assert_eq!(expected, target, "{} should store to target", stringify!($name));
                        });
                    }
                }
            }
        }
    };
}

// Reference semantics for each LSE read-modify-write family.
test_op!(add, |left, right| left.wrapping_add(right));
test_op!(clr, |left, right| left & !right);
test_op!(xor, std::ops::BitXor::bitxor);
test_op!(or, std::ops::BitOr::bitor);

// Instantiate one test per width/ordering combination for every family.
compiler_builtins::foreach_cas!(cas::test);
compiler_builtins::foreach_cas16!(test_cas16);
compiler_builtins::foreach_swp!(swap::test);
compiler_builtins::foreach_ldadd!(add::test);
compiler_builtins::foreach_ldclr!(clr::test);
compiler_builtins::foreach_ldeor!(xor::test);
compiler_builtins::foreach_ldset!(or::test);

View file

@ -0,0 +1,286 @@
extern crate compiler_builtins;
// The Rust implementations of the C `mem*` family under test.
use compiler_builtins::mem::{memcmp, memcpy, memmove, memset};
// Native pointer width in bytes; used to construct word-aligned and
// deliberately misaligned offsets in the tests below.
const WORD_SIZE: usize = core::mem::size_of::<usize>();
#[test]
fn memcpy_3() {
    // Copy three bytes toward the front of the buffer (ranges disjoint).
    let mut arr: [u8; 12] = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11];
    unsafe {
        let dst = arr.as_mut_ptr().add(1);
        let src = arr.as_ptr().add(9);
        assert_eq!(memcpy(dst, src, 3), dst);
        assert_eq!(arr, [0, 9, 10, 11, 4, 5, 6, 7, 8, 9, 10, 11]);
    }
    // Reset and copy the other direction.
    arr = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11];
    unsafe {
        let dst = arr.as_mut_ptr().add(9);
        let src = arr.as_ptr().add(1);
        assert_eq!(memcpy(dst, src, 3), dst);
        assert_eq!(arr, [0, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3]);
    }
}
#[test]
fn memcpy_10() {
    let arr: [u8; 18] = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17];
    let mut dst: [u8; 12] = [0; 12];
    // Ten-byte copies from two source offsets; the final two bytes of
    // `dst` must remain untouched both times.
    unsafe {
        assert_eq!(memcpy(dst.as_mut_ptr(), arr.as_ptr().add(1), 10), dst.as_mut_ptr());
        assert_eq!(dst, [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 0, 0]);
    }
    unsafe {
        assert_eq!(memcpy(dst.as_mut_ptr(), arr.as_ptr().add(8), 10), dst.as_mut_ptr());
        assert_eq!(dst, [8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 0, 0]);
    }
}
#[test]
fn memcpy_big() {
    // Make the arrays cross 3 pages
    const SIZE: usize = 8193;
    let src: [u8; SIZE] = [22; SIZE];
    // Guard fields before and after the buffer: if `memcpy` writes out of
    // bounds they would likely be clobbered.
    struct Dst {
        start: usize,
        buf: [u8; SIZE],
        end: usize,
    }
    let mut dst = Dst {
        start: 0,
        buf: [0; SIZE],
        end: 0,
    };
    unsafe {
        assert_eq!(
            memcpy(dst.buf.as_mut_ptr(), src.as_ptr(), SIZE),
            dst.buf.as_mut_ptr()
        );
        // Buffer fully copied, guards untouched.
        assert_eq!(dst.start, 0);
        assert_eq!(dst.buf, [22; SIZE]);
        assert_eq!(dst.end, 0);
    }
}
#[test]
fn memmove_forward() {
    // Overlapping move where the destination precedes the source.
    let mut arr: [u8; 12] = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11];
    unsafe {
        let dst = arr.as_mut_ptr().add(3);
        let src = arr.as_ptr().add(6);
        assert_eq!(memmove(dst, src, 5), dst);
    }
    assert_eq!(arr, [0, 1, 2, 6, 7, 8, 9, 10, 8, 9, 10, 11]);
}
#[test]
fn memmove_backward() {
    // Overlapping move where the destination follows the source.
    let mut arr: [u8; 12] = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11];
    unsafe {
        let dst = arr.as_mut_ptr().add(6);
        let src = arr.as_ptr().add(3);
        assert_eq!(memmove(dst, src, 5), dst);
    }
    assert_eq!(arr, [0, 1, 2, 3, 4, 5, 3, 4, 5, 6, 7, 11]);
}
#[test]
fn memset_zero() {
    let mut arr: [u8; 8] = [0, 1, 2, 3, 4, 5, 6, 7];
    // Clear two bytes in the middle of the buffer.
    let tail = unsafe { arr.as_mut_ptr().add(5) };
    unsafe {
        assert_eq!(memset(tail, 0, 2), tail);
    }
    assert_eq!(arr, [0, 1, 2, 3, 4, 0, 0, 7]);
    // Only the LSB matters for a memset
    let base = arr.as_mut_ptr();
    unsafe {
        assert_eq!(memset(base, 0x2000, 8), base);
    }
    assert_eq!(arr, [0, 0, 0, 0, 0, 0, 0, 0]);
}
#[test]
fn memset_nonzero() {
    let mut arr: [u8; 8] = [0, 1, 2, 3, 4, 5, 6, 7];
    // Fill three bytes in the middle with 22.
    let mid = unsafe { arr.as_mut_ptr().add(2) };
    unsafe {
        assert_eq!(memset(mid, 22, 3), mid);
    }
    assert_eq!(arr, [0, 1, 22, 22, 22, 5, 6, 7]);
    // Only the LSB matters for a memset
    let base = arr.as_mut_ptr();
    unsafe {
        assert_eq!(memset(base, 0x2009, 8), base);
    }
    assert_eq!(arr, [9, 9, 9, 9, 9, 9, 9, 9]);
}
#[test]
fn memcmp_eq() {
    // Two identical buffers compare equal at every prefix length, in
    // both argument orders.
    let first = gen_arr::<256>();
    let second = first;
    for len in 0..256 {
        unsafe {
            assert_eq!(memcmp(first.0.as_ptr(), second.0.as_ptr(), len), 0);
            assert_eq!(memcmp(second.0.as_ptr(), first.0.as_ptr(), len), 0);
        }
    }
}
#[test]
fn memcmp_ne() {
    // The `@` pattern binds two independent copies of the same array.
    let arr1 @ arr2 = gen_arr::<256>();
    // Reduce iteration count in Miri as it is too slow otherwise.
    let limit = if cfg!(miri) { 64 } else { 256 };
    for i in 0..limit {
        // Make position `i` differ, then compare every prefix length that
        // includes it; only the sign of the result is specified.
        let mut diff_arr = arr1;
        diff_arr.0[i] = 127;
        let expect = diff_arr.0[i].cmp(&arr2.0[i]);
        for k in i + 1..limit {
            let result = unsafe { memcmp(diff_arr.0.as_ptr(), arr2.0.as_ptr(), k) };
            assert_eq!(expect, result.cmp(&0));
        }
    }
}
/// Byte buffer whose zero-sized `[usize; 0]` member forces word alignment
/// of field 0, so tests can build precisely aligned and misaligned
/// pointers from it.
#[derive(Clone, Copy)]
struct AlignedStorage<const N: usize>([u8; N], [usize; 0]);

/// Build an aligned buffer filled with 0, 1, 2, … (wrapping at 256).
fn gen_arr<const N: usize>() -> AlignedStorage<N> {
    let mut storage = AlignedStorage::<N>([0; N], []);
    for (idx, byte) in storage.0.iter_mut().enumerate() {
        *byte = idx as u8;
    }
    storage
}
#[test]
fn memmove_forward_misaligned_nonaligned_start() {
    let mut arr = gen_arr::<32>();
    let mut reference = arr;
    unsafe {
        // Overlapping forward move where neither pointer is word aligned.
        let src = arr.0.as_ptr().add(6);
        let dst = arr.0.as_mut_ptr().add(3);
        assert_eq!(memmove(dst, src, 17), dst);
    }
    // `copy_within` is the safe oracle for an overlapping move.
    reference.0.copy_within(6..6 + 17, 3);
    assert_eq!(arr.0, reference.0);
}
#[test]
fn memmove_forward_misaligned_aligned_start() {
    let mut arr = gen_arr::<32>();
    let mut reference = arr;
    unsafe {
        // Misaligned source, word-aligned destination at the buffer start.
        let src = arr.0.as_ptr().add(6);
        let dst = arr.0.as_mut_ptr();
        assert_eq!(memmove(dst, src, 17), dst);
    }
    reference.0.copy_within(6..6 + 17, 0);
    assert_eq!(arr.0, reference.0);
}
#[test]
fn memmove_forward_aligned() {
    let mut arr = gen_arr::<32>();
    let mut reference = arr;
    unsafe {
        // src and dst differ by exactly WORD_SIZE, so both sides share the
        // same alignment throughout the copy.
        let src = arr.0.as_ptr().add(3 + WORD_SIZE);
        let dst = arr.0.as_mut_ptr().add(3);
        assert_eq!(memmove(dst, src, 17), dst);
        // Mirror the move with the known-good `copy_within` and compare.
        reference
            .0
            .copy_within(3 + WORD_SIZE..3 + WORD_SIZE + 17, 3);
        assert_eq!(arr.0, reference.0);
    }
}
#[test]
fn memmove_backward_misaligned_nonaligned_start() {
    // Overlapping move toward higher addresses (dst > src), checked against
    // the known-good `copy_within`.
    let mut buf = gen_arr::<32>();
    let mut expected = buf;
    unsafe {
        let src = buf.0.as_ptr().add(3);
        let dst = buf.0.as_mut_ptr().add(6);
        assert_eq!(memmove(dst, src, 17), dst);
        expected.0.copy_within(3..3 + 17, 6);
        assert_eq!(buf.0, expected.0);
    }
}
#[test]
fn memmove_backward_misaligned_aligned_start() {
    let mut arr = gen_arr::<32>();
    let mut reference = arr;
    unsafe {
        // dst > src (backward copy); dst sits at a word-sized offset while
        // the src/dst distance is not a word multiple.
        let src = arr.0.as_ptr().offset(3);
        let dst = arr.0.as_mut_ptr().add(WORD_SIZE);
        assert_eq!(memmove(dst, src, 17), dst);
        // Mirror the move with the known-good `copy_within` and compare.
        reference.0.copy_within(3..3 + 17, WORD_SIZE);
        assert_eq!(arr.0, reference.0);
    }
}
#[test]
fn memmove_backward_aligned() {
    let mut arr = gen_arr::<32>();
    let mut reference = arr;
    unsafe {
        // dst > src (backward copy) with a src/dst distance of exactly
        // WORD_SIZE, so both sides share the same alignment.
        let src = arr.0.as_ptr().add(3);
        let dst = arr.0.as_mut_ptr().add(3 + WORD_SIZE);
        assert_eq!(memmove(dst, src, 17), dst);
        // Mirror the move with the known-good `copy_within` and compare.
        reference.0.copy_within(3..3 + 17, 3 + WORD_SIZE);
        assert_eq!(arr.0, reference.0);
    }
}
#[test]
fn memmove_misaligned_bounds() {
    // The above tests have the downside that the addresses surrounding the range-to-copy are all
    // still in-bounds, so Miri would not actually complain about OOB accesses. So we also test with
    // an array that has just the right size. We test a few times to avoid it being accidentally
    // aligned.
    for _ in 0..8 {
        // 17 is deliberately not a multiple of the word size.
        let mut arr1 = [0u8; 17];
        let mut arr2 = [0u8; 17];
        unsafe {
            // Copy both ways so we hit both the forward and backward cases.
            memmove(arr1.as_mut_ptr(), arr2.as_mut_ptr(), 17);
            memmove(arr2.as_mut_ptr(), arr1.as_mut_ptr(), 17);
        }
    }
}
#[test]
fn memset_backward_misaligned_nonaligned_start() {
    // Fill 17 bytes starting at a non-word offset; verify against the
    // known-good `core::ptr::write_bytes`.
    let mut buf = gen_arr::<32>();
    let mut expected = buf;
    unsafe {
        let dst = buf.0.as_mut_ptr().add(6);
        assert_eq!(memset(dst, 0xCC, 17), dst);
        core::ptr::write_bytes(expected.0.as_mut_ptr().add(6), 0xCC, 17);
        assert_eq!(buf.0, expected.0);
    }
}
#[test]
fn memset_backward_misaligned_aligned_start() {
    let mut arr = gen_arr::<32>();
    let mut reference = arr;
    unsafe {
        // Fill starts at a word-sized offset into the aligned storage.
        let ptr = arr.0.as_mut_ptr().add(WORD_SIZE);
        assert_eq!(memset(ptr, 0xCC, 17), ptr);
        // Mirror the fill with the known-good `write_bytes` and compare.
        core::ptr::write_bytes(reference.0.as_mut_ptr().add(WORD_SIZE), 0xCC, 17);
        assert_eq!(arr.0, reference.0);
    }
}
#[test]
fn memset_backward_aligned() {
    let mut arr = gen_arr::<32>();
    let mut reference = arr;
    unsafe {
        // Fill starts WORD_SIZE past an odd base offset of 3.
        let ptr = arr.0.as_mut_ptr().add(3 + WORD_SIZE);
        assert_eq!(memset(ptr, 0xCC, 17), ptr);
        // Mirror the fill with the known-good `write_bytes` and compare.
        core::ptr::write_bytes(reference.0.as_mut_ptr().add(3 + WORD_SIZE), 0xCC, 17);
        assert_eq!(arr.0, reference.0);
    }
}

View file

@ -0,0 +1,202 @@
// makes configuration easier
#![allow(unused_macros)]
use builtins_test::*;
/// Make sure that the edge case tester and randomized tester don't break, and list examples of
/// fuzz values for documentation purposes.
#[test]
fn fuzz_values() {
    // Expected sequence: hand-picked edge-case bit patterns first, followed by
    // the values the fuzzer is observed to produce afterward.
    const VALS: [u16; 47] = [
        0b0, // edge cases
        0b1111111111111111,
        0b1111111111111110,
        0b1111111111111100,
        0b1111111110000000,
        0b1111111100000000,
        0b1110000000000000,
        0b1100000000000000,
        0b1000000000000000,
        0b111111111111111,
        0b111111111111110,
        0b111111111111100,
        0b111111110000000,
        0b111111100000000,
        0b110000000000000,
        0b100000000000000,
        0b11111111111111,
        0b11111111111110,
        0b11111111111100,
        0b11111110000000,
        0b11111100000000,
        0b10000000000000,
        0b111111111,
        0b111111110,
        0b111111100,
        0b110000000,
        0b100000000,
        0b11111111,
        0b11111110,
        0b11111100,
        0b10000000,
        0b111,
        0b110,
        0b100,
        0b11,
        0b10,
        0b1,
        0b1010110100000, // beginning of random fuzzing
        0b1100011001011010,
        0b1001100101001111,
        0b1101010100011010,
        0b100010001,
        0b1000000000000000,
        0b1100000000000101,
        0b1100111101010101,
        0b1100010111111111,
        0b1111110101111111,
    ];
    // Cursor into `VALS`, advanced once per value the fuzzer hands us.
    let mut i = 0;
    fuzz(10, |x: u16| {
        assert_eq!(x, VALS[i]);
        i += 1;
    });
}
/// Compare every leading-zeros implementation against `u*::leading_zeros`.
#[test]
fn leading_zeros() {
    use compiler_builtins::int::leading_zeros::{leading_zeros_default, leading_zeros_riscv};
    // 32-bit variants.
    {
        use compiler_builtins::int::leading_zeros::__clzsi2;
        fuzz(N, |x: u32| {
            if x == 0 {
                return; // undefined value for an intrinsic
            }
            let lz = x.leading_zeros() as usize;
            let lz0 = __clzsi2(x);
            let lz1 = leading_zeros_default(x);
            let lz2 = leading_zeros_riscv(x);
            if lz0 != lz {
                panic!("__clzsi2({x}): std: {lz}, builtins: {lz0}");
            }
            if lz1 != lz {
                panic!("leading_zeros_default({x}): std: {lz}, builtins: {lz1}");
            }
            if lz2 != lz {
                panic!("leading_zeros_riscv({x}): std: {lz}, builtins: {lz2}");
            }
        });
    }
    // 64-bit variants.
    {
        use compiler_builtins::int::leading_zeros::__clzdi2;
        fuzz(N, |x: u64| {
            if x == 0 {
                return; // undefined value for an intrinsic
            }
            let lz = x.leading_zeros() as usize;
            let lz0 = __clzdi2(x);
            let lz1 = leading_zeros_default(x);
            let lz2 = leading_zeros_riscv(x);
            if lz0 != lz {
                panic!("__clzdi2({x}): std: {lz}, builtins: {lz0}");
            }
            if lz1 != lz {
                panic!("leading_zeros_default({x}): std: {lz}, builtins: {lz1}");
            }
            if lz2 != lz {
                panic!("leading_zeros_riscv({x}): std: {lz}, builtins: {lz2}");
            }
        });
    }
    // 128-bit: only the intrinsic is exercised here.
    {
        use compiler_builtins::int::leading_zeros::__clzti2;
        fuzz(N, |x: u128| {
            if x == 0 {
                return; // undefined value for an intrinsic
            }
            let lz = x.leading_zeros() as usize;
            let lz0 = __clzti2(x);
            if lz0 != lz {
                panic!("__clzti2({x}): std: {lz}, builtins: {lz0}");
            }
        });
    }
}
/// Compare every trailing-zeros implementation against `u*::trailing_zeros`.
#[test]
fn trailing_zeros() {
    use compiler_builtins::int::trailing_zeros::{__ctzdi2, __ctzsi2, __ctzti2, trailing_zeros};
    // 32-bit variants.
    fuzz(N, |x: u32| {
        if x == 0 {
            return; // undefined value for an intrinsic
        }
        let tz = x.trailing_zeros() as usize;
        let tz0 = __ctzsi2(x);
        let tz1 = trailing_zeros(x);
        if tz0 != tz {
            panic!("__ctzsi2({x}): std: {tz}, builtins: {tz0}");
        }
        if tz1 != tz {
            panic!("trailing_zeros({x}): std: {tz}, builtins: {tz1}");
        }
    });
    // 64-bit variants.
    fuzz(N, |x: u64| {
        if x == 0 {
            return; // undefined value for an intrinsic
        }
        let tz = x.trailing_zeros() as usize;
        let tz0 = __ctzdi2(x);
        let tz1 = trailing_zeros(x);
        if tz0 != tz {
            panic!("__ctzdi2({x}): std: {tz}, builtins: {tz0}");
        }
        if tz1 != tz {
            panic!("trailing_zeros({x}): std: {tz}, builtins: {tz1}");
        }
    });
    // 128-bit: only the intrinsic is exercised here.
    fuzz(N, |x: u128| {
        if x == 0 {
            return; // undefined value for an intrinsic
        }
        let tz = x.trailing_zeros() as usize;
        let tz0 = __ctzti2(x);
        if tz0 != tz {
            panic!("__ctzti2({x}): std: {tz}, builtins: {tz0}");
        }
    });
}
/// Compare the byte-swap intrinsics against `u*::swap_bytes`, with a few
/// fixed-value sanity checks as well.
#[test]
fn bswap() {
    use compiler_builtins::int::bswap::{__bswapdi2, __bswapsi2};
    fuzz(N, |x: u32| {
        assert_eq!(x.swap_bytes(), __bswapsi2(x));
    });
    fuzz(N, |x: u64| {
        assert_eq!(x.swap_bytes(), __bswapdi2(x));
    });
    // Known-answer checks so a broken fuzzer can't mask a broken intrinsic.
    assert_eq!(__bswapsi2(0x12345678u32), 0x78563412u32);
    assert_eq!(__bswapsi2(0x00000001u32), 0x01000000u32);
    assert_eq!(__bswapdi2(0x123456789ABCDEF0u64), 0xF0DEBC9A78563412u64);
    assert_eq!(__bswapdi2(0x0200000001000000u64), 0x0000000100000002u64);
    #[cfg(any(target_pointer_width = "32", target_pointer_width = "64"))]
    {
        use compiler_builtins::int::bswap::__bswapti2;
        fuzz(N, |x: u128| {
            assert_eq!(x.swap_bytes(), __bswapti2(x));
        });
        assert_eq!(
            __bswapti2(0x123456789ABCDEF013579BDF02468ACEu128),
            0xCE8A4602DF9B5713F0DEBC9A78563412u128
        );
        assert_eq!(
            __bswapti2(0x04000000030000000200000001000000u128),
            0x00000001000000020000000300000004u128
        );
    }
}

View file

@ -0,0 +1,150 @@
#![allow(unused_macros)]
#![cfg_attr(f128_enabled, feature(f128))]
use builtins_test::*;
mod int_mul {
    use super::*;

    /// Generate a test comparing a builtins multiply against `wrapping_mul`
    /// for fuzzed input pairs.
    macro_rules! mul {
        ($($i:ty, $fn:ident);*;) => {
            $(
                #[test]
                fn $fn() {
                    use compiler_builtins::int::mul::$fn;
                    fuzz_2(N, |x: $i, y: $i| {
                        let mul0 = x.wrapping_mul(y);
                        let mul1: $i = $fn(x, y);
                        if mul0 != mul1 {
                            panic!(
                                "{func}({x}, {y}): std: {mul0}, builtins: {mul1}",
                                func = stringify!($fn),
                            );
                        }
                    });
                }
            )*
        };
    }

    mul! {
        u64, __muldi3;
        i128, __multi3;
    }
}
mod int_overflowing_mul {
    use super::*;

    /// Generate a test comparing a builtins overflowing multiply (which
    /// reports overflow via an out-parameter) against `overflowing_mul`.
    macro_rules! overflowing_mul {
        ($($i:ty, $fn:ident);*;) => {
            $(
                #[test]
                fn $fn() {
                    use compiler_builtins::int::mul::$fn;
                    fuzz_2(N, |x: $i, y: $i| {
                        let (mul0, o0) = x.overflowing_mul(y);
                        let mut o1 = 0i32;
                        let mul1: $i = $fn(x, y, &mut o1);
                        // The intrinsic reports overflow as a nonzero int.
                        let o1 = o1 != 0;
                        if mul0 != mul1 || o0 != o1 {
                            panic!(
                                "{func}({x}, {y}): std: ({mul0}, {o0}), builtins: ({mul1}, {o1})",
                                func = stringify!($fn),
                            );
                        }
                    });
                }
            )*
        };
    }

    overflowing_mul! {
        i32, __mulosi4;
        i64, __mulodi4;
        i128, __muloti4;
    }

    /// The 128-bit Rust-specific overflowing multiplies, checked for both the
    /// unsigned and (by reinterpreting the same inputs) signed variants.
    #[test]
    fn overflowing_mul_u128() {
        use compiler_builtins::int::mul::{__rust_i128_mulo, __rust_u128_mulo};
        fuzz_2(N, |x: u128, y: u128| {
            let mut o1 = 0;
            let (mul0, o0) = x.overflowing_mul(y);
            let mul1 = __rust_u128_mulo(x, y, &mut o1);
            if mul0 != mul1 || i32::from(o0) != o1 {
                panic!("__rust_u128_mulo({x}, {y}): std: ({mul0}, {o0}), builtins: ({mul1}, {o1})",);
            }
            let x = x as i128;
            let y = y as i128;
            let (mul0, o0) = x.overflowing_mul(y);
            let mul1 = __rust_i128_mulo(x, y, &mut o1);
            if mul0 != mul1 || i32::from(o0) != o1 {
                panic!("__rust_i128_mulo({x}, {y}): std: ({mul0}, {o0}), builtins: ({mul1}, {o1})",);
            }
        });
    }
}
/// Generate a test comparing a builtins float multiply against either the
/// native `Mul` or an apfloat fallback when `$sys_available` is not met.
/// `eq_repr` compares bit representations, so NaN patterns are also checked.
macro_rules! float_mul {
    ($($f:ty, $fn:ident, $apfloat_ty:ident, $sys_available:meta);*;) => {
        $(
            #[test]
            fn $fn() {
                use compiler_builtins::float::{mul::$fn, Float};
                use core::ops::Mul;
                fuzz_float_2(N, |x: $f, y: $f| {
                    let mul0 = apfloat_fallback!($f, $apfloat_ty, $sys_available, Mul::mul, x, y);
                    let mul1: $f = $fn(x, y);
                    if !Float::eq_repr(mul0, mul1) {
                        panic!(
                            "{func}({x:?}, {y:?}): std: {mul0:?}, builtins: {mul1:?}",
                            func = stringify!($fn),
                        );
                    }
                });
            }
        )*
    };
}
// x87 (x86 without SSE) is excluded; see the FIXME below for arm.
#[cfg(not(all(target_arch = "x86", not(target_feature = "sse"))))]
mod float_mul {
    use super::*;

    // FIXME(#616): Stop ignoring arches that don't have native support once fix for builtins is in
    // nightly.
    float_mul! {
        f32, __mulsf3, Single, not(target_arch = "arm");
        f64, __muldf3, Double, not(target_arch = "arm");
    }
}
// f128 multiply test for non-PowerPC targets (PowerPC uses `__mulkf3` below).
#[cfg(f128_enabled)]
#[cfg(not(all(target_arch = "x86", not(target_feature = "sse"))))]
#[cfg(not(any(target_arch = "powerpc", target_arch = "powerpc64")))]
mod float_mul_f128 {
    use super::*;

    float_mul! {
        f128, __multf3, Quad,
        // FIXME(llvm): there is a bug in LLVM rt.
        // See <https://github.com/llvm/llvm-project/issues/91840>.
        not(any(feature = "no-sys-f128", all(target_arch = "aarch64", target_os = "linux")));
    }
}
// PowerPC spells the f128 multiply `__mulkf3` rather than `__multf3`.
#[cfg(f128_enabled)]
#[cfg(any(target_arch = "powerpc", target_arch = "powerpc64"))]
mod float_mul_f128_ppc {
    use super::*;

    float_mul! {
        f128, __mulkf3, Quad, not(feature = "no-sys-f128");
    }
}

View file

@ -0,0 +1,35 @@
use builtins_test::*;
/// Generate a test comparing a builtins shift intrinsic against the
/// equivalent std wrapping-shift method for fuzzed value/amount pairs.
macro_rules! shift {
    ($($i:ty, $fn_std:ident, $fn_builtins:ident);*;) => {
        $(
            #[test]
            fn $fn_builtins() {
                use compiler_builtins::int::shift::$fn_builtins;
                fuzz_shift(|x: $i, s: u32| {
                    let tmp0: $i = x.$fn_std(s);
                    let tmp1: $i = $fn_builtins(x, s);
                    if tmp0 != tmp1 {
                        panic!(
                            "{}({}, {}): std: {}, builtins: {}",
                            stringify!($fn_builtins), x, s, tmp0, tmp1
                        );
                    }
                });
            }
        )*
    };
}

// Shift-left (__ashl*), arithmetic shift-right on signed (__ashr*), and
// logical shift-right on unsigned (__lshr*), at 32/64/128 bits.
shift! {
    u32, wrapping_shl, __ashlsi3;
    u64, wrapping_shl, __ashldi3;
    u128, wrapping_shl, __ashlti3;
    i32, wrapping_shr, __ashrsi3;
    i64, wrapping_shr, __ashrdi3;
    i128, wrapping_shr, __ashrti3;
    u32, wrapping_shr, __lshrsi3;
    u64, wrapping_shr, __lshrdi3;
    u128, wrapping_shr, __lshrti3;
}

View file

@ -0,0 +1,58 @@
#!/bin/bash
# Run the iai-callgrind icount benchmarks against the most recent master
# baseline, then package the fresh results as the next baseline artifact.
set -eux

iai_home="iai-home"

# Download the baseline from master
./ci/ci-util.py locate-baseline --download --extract

# Run benchmarks once. Arguments before `--` are passed to cargo, arguments
# after it to iai-callgrind.
function run_icount_benchmarks() {
    cargo_args=(
        "--bench" "icount"
        "--no-default-features"
        "--features" "unstable,unstable-float,icount"
    )
    iai_args=(
        "--home" "$(pwd)/$iai_home"
        "--regression=ir=5.0"
        "--save-summary"
    )

    # Parse `cargo_arg0 cargo_arg1 -- iai_arg0 iai_arg1` syntax
    parsing_iai_args=0
    while [ "$#" -gt 0 ]; do
        if [ "$parsing_iai_args" == "1" ]; then
            iai_args+=("$1")
        elif [ "$1" == "--" ]; then
            parsing_iai_args=1
        else
            cargo_args+=("$1")
        fi
        shift
    done

    # Run iai-callgrind benchmarks
    cargo bench "${cargo_args[@]}" -- "${iai_args[@]}"

    # NB: iai-callgrind should exit on error but does not, so we inspect the summary
    # for errors. See https://github.com/iai-callgrind/iai-callgrind/issues/337
    if [ -n "${PR_NUMBER:-}" ]; then
        # If this is for a pull request, ignore regressions if specified.
        ./ci/ci-util.py check-regressions --home "$iai_home" --allow-pr-override "$PR_NUMBER"
    else
        # On non-PR runs, regressions are reported but never fail the job.
        ./ci/ci-util.py check-regressions --home "$iai_home" || true
    fi
}

# Run once with softfloats, once with arch instructions enabled
run_icount_benchmarks --features force-soft-floats -- --save-baseline=softfloat
run_icount_benchmarks -- --save-baseline=hardfloat

# Name and tar the new baseline
name="baseline-icount-$(date -u +'%Y%m%d%H%M')-${GITHUB_SHA:0:12}"
echo "BASELINE_NAME=$name" >>"$GITHUB_ENV"
tar cJf "$name.tar.xz" "$iai_home"

View file

@ -0,0 +1,438 @@
#!/usr/bin/env python3
"""Utilities for CI.
This dynamically prepares a list of routines that had a source file change based on
git history.
"""
import json
import os
import re
import subprocess as sp
import sys
from dataclasses import dataclass
from glob import glob, iglob
from inspect import cleandoc
from os import getenv
from pathlib import Path
from typing import TypedDict, Self
USAGE = cleandoc(
    """
    usage:

    ./ci/ci-util.py <COMMAND> [flags]

    COMMAND:
        generate-matrix
            Calculate a matrix of which functions had source change, print that as
            a JSON object.

        locate-baseline [--download] [--extract]
            Locate the most recent benchmark baseline available in CI and, if flags
            specify, download and extract it. Never exits with nonzero status if
            downloading fails.

            Note that `--extract` will overwrite files in `iai-home`.

        check-regressions [--home iai-home] [--allow-pr-override pr_number]
            Check the directory passed via `--home` (`iai-home` if unspecified) for
            `summary.json` files and see if there are any regressions. This is used
            as a workaround for `iai-callgrind` not exiting with error status; see
            <https://github.com/iai-callgrind/iai-callgrind/issues/337>.

            If `--allow-pr-override` is specified, the regression check will not exit
            with failure if any line in the PR starts with `allow-regressions`.
    """
)

# Repository root, assuming this script lives in `<root>/ci/`.
REPO_ROOT = Path(__file__).parent.parent
GIT = ["git", "-C", REPO_ROOT]
DEFAULT_BRANCH = "master"
WORKFLOW_NAME = "CI"  # Workflow that generates the benchmark artifacts
ARTIFACT_GLOB = "baseline-icount*"

# Place this in a PR body to skip regression checks (must be at the start of a line).
REGRESSION_DIRECTIVE = "ci: allow-regressions"
# Place this in a PR body to skip extensive tests
SKIP_EXTENSIVE_DIRECTIVE = "ci: skip-extensive"
# Place this in a PR body to allow running a large number of extensive tests. If not
# set, this script will error out if a threshold is exceeded in order to avoid
# accidentally spending huge amounts of CI time.
ALLOW_MANY_EXTENSIVE_DIRECTIVE = "ci: allow-many-extensive"
MANY_EXTENSIVE_THRESHOLD = 20

# Don't run exhaustive tests if these files change, even if they contain a function
# definition.
IGNORE_FILES = [
    "libm/src/math/support/",
    "libm/src/libm_helper.rs",
    "libm/src/math/arch/intrinsics.rs",
]

# libm PR CI takes a long time and doesn't need to run unless relevant files have been
# changed. Anything matching this regex pattern will trigger a run.
TRIGGER_LIBM_PR_CI = ".*(libm|musl).*"

# Types a routine may be keyed under in `function-definitions.json`.
TYPES = ["f16", "f32", "f64", "f128"]
def eprint(*args, **kwargs):
    """Like `print`, but directed at stderr so stdout stays machine-readable."""
    print(*args, file=sys.stderr, **kwargs)
@dataclass
class PrInfo:
    """GitHub response for PR query"""

    # PR description body.
    body: str
    # Commit hashes in the PR (the jq filter below flattens objects to oids).
    commits: list[str]
    # PR creation timestamp (renamed from GitHub's `createdAt`).
    created_at: str
    # PR number.
    number: int

    @classmethod
    def load(cls, pr_number: int | str) -> Self:
        """For a given PR number, query the body and commit list"""
        # Requires the `gh` CLI to be installed and authenticated.
        pr_info = sp.check_output(
            [
                "gh",
                "pr",
                "view",
                str(pr_number),
                "--json=number,commits,body,createdAt",
                # Flatten the commit list to only hashes, change a key to snake naming
                "--jq=.commits |= map(.oid) | .created_at = .createdAt | del(.createdAt)",
            ],
            text=True,
        )
        eprint("PR info:", json.dumps(pr_info, indent=4))
        return cls(**json.loads(pr_info))

    def contains_directive(self, directive: str) -> bool:
        """Return true if the provided directive is on a line in the PR body"""
        lines = self.body.splitlines()
        return any(line.startswith(directive) for line in lines)
class FunctionDef(TypedDict):
    """Type for an entry in `function-definitions.json`"""

    # Source files implementing the routine; used to match changed paths.
    sources: list[str]
    # Type the routine is grouped under (one of the entries in TYPES).
    type: str
class Context:
    """Collects the change list and routine definitions for this CI run."""

    # Value of GITHUB_REF when running in CI, `None` otherwise.
    gh_ref: str | None
    # Files changed by the PR; empty when no diff is available.
    changed: list[Path]
    # Routine metadata from `etc/function-definitions.json`, keyed by name.
    defs: dict[str, FunctionDef]

    def __init__(self) -> None:
        self.gh_ref = getenv("GITHUB_REF")
        self.changed = []
        self._init_change_list()
        with open(REPO_ROOT.joinpath("etc/function-definitions.json")) as f:
            defs = json.load(f)
        # The JSON carries a `__comment` entry that is not a routine.
        defs.pop("__comment", None)
        self.defs = defs

    def _init_change_list(self):
        """Create a list of files that have been changed. This uses GITHUB_REF if
        available, otherwise a diff between `HEAD` and `master`.
        """
        # For pull requests, GitHub creates a ref `refs/pull/1234/merge` (1234 being
        # the PR number), and sets this as `GITHUB_REF`.
        ref = self.gh_ref
        eprint(f"using ref `{ref}`")
        if not self.is_pr():
            # If the ref is not for `merge` then we are not in PR CI
            eprint("No diff available for ref")
            return
        # The ref is for a dummy merge commit. We can extract the merge base by
        # inspecting all parents (`^@`).
        merge_sha = sp.check_output(
            GIT + ["show-ref", "--hash", ref], text=True
        ).strip()
        merge_log = sp.check_output(GIT + ["log", "-1", merge_sha], text=True)
        eprint(f"Merge:\n{merge_log}\n")
        parents = (
            sp.check_output(GIT + ["rev-parse", f"{merge_sha}^@"], text=True)
            .strip()
            .splitlines()
        )
        assert len(parents) == 2, f"expected two-parent merge but got:\n{parents}"
        # First parent is the target branch, second is the incoming change.
        base = parents[0].strip()
        incoming = parents[1].strip()
        eprint(f"base: {base}, incoming: {incoming}")
        textlist = sp.check_output(
            GIT + ["diff", base, incoming, "--name-only"], text=True
        )
        self.changed = [Path(p) for p in textlist.splitlines()]

    def is_pr(self) -> bool:
        """Check if we are looking at a PR rather than a push."""
        return self.gh_ref is not None and "merge" in self.gh_ref

    @staticmethod
    def _ignore_file(fname: str) -> bool:
        # True when `fname` falls under a path listed in IGNORE_FILES.
        return any(fname.startswith(pfx) for pfx in IGNORE_FILES)

    def changed_routines(self) -> dict[str, list[str]]:
        """Create a list of routines for which one or more files have been updated,
        separated by type.
        """
        routines = set()
        for name, meta in self.defs.items():
            # Don't update if changes to the file should be ignored
            sources = (f for f in meta["sources"] if not self._ignore_file(f))
            # Select changed files
            changed = [f for f in sources if Path(f) in self.changed]
            if len(changed) > 0:
                eprint(f"changed files for {name}: {changed}")
                routines.add(name)
        # Group the sorted routine names by their type.
        ret: dict[str, list[str]] = {}
        for r in sorted(routines):
            ret.setdefault(self.defs[r]["type"], []).append(r)
        return ret

    def may_skip_libm_ci(self) -> bool:
        """If this is a PR and no libm files were changed, allow skipping libm
        jobs."""
        if self.is_pr():
            return all(not re.match(TRIGGER_LIBM_PR_CI, str(f)) for f in self.changed)
        return False

    def emit_workflow_output(self):
        """Create a JSON object a list items for each type's changed files, if any
        did change, and the routines that were affected by the change.
        """
        pr_number = os.environ.get("PR_NUMBER")
        skip_tests = False
        error_on_many_tests = False
        if pr_number is not None and len(pr_number) > 0:
            # PR directives may opt out of (or into many) extensive tests.
            pr = PrInfo.load(pr_number)
            skip_tests = pr.contains_directive(SKIP_EXTENSIVE_DIRECTIVE)
            error_on_many_tests = not pr.contains_directive(
                ALLOW_MANY_EXTENSIVE_DIRECTIVE
            )
            if skip_tests:
                eprint("Skipping all extensive tests")
        changed = self.changed_routines()
        matrix = []
        total_to_test = 0
        # Figure out which extensive tests need to run
        for ty in TYPES:
            ty_changed = changed.get(ty, [])
            ty_to_test = [] if skip_tests else ty_changed
            total_to_test += len(ty_to_test)
            item = {
                "ty": ty,
                "changed": ",".join(ty_changed),
                "to_test": ",".join(ty_to_test),
            }
            matrix.append(item)
        ext_matrix = json.dumps({"extensive_matrix": matrix}, separators=(",", ":"))
        may_skip = str(self.may_skip_libm_ci()).lower()
        # Emit to stdout for the workflow and mirror to stderr for the log.
        print(f"extensive_matrix={ext_matrix}")
        print(f"may_skip_libm_ci={may_skip}")
        eprint(f"extensive_matrix={ext_matrix}")
        eprint(f"may_skip_libm_ci={may_skip}")
        eprint(f"total extensive tests: {total_to_test}")
        if error_on_many_tests and total_to_test > MANY_EXTENSIVE_THRESHOLD:
            eprint(
                f"More than {MANY_EXTENSIVE_THRESHOLD} tests would be run; add"
                f" `{ALLOW_MANY_EXTENSIVE_DIRECTIVE}` to the PR body if this is"
                " intentional. If this is refactoring that happens to touch a lot of"
                f" files, `{SKIP_EXTENSIVE_DIRECTIVE}` can be used instead."
            )
            exit(1)
def locate_baseline(flags: list[str]) -> None:
    """Find the most recent baseline from CI, download it if specified.

    This returns rather than erroring, even if the `gh` commands fail. This is to avoid
    erroring in CI if the baseline is unavailable (artifact time limit exceeded, first
    run on the branch, etc).
    """
    download = False
    extract = False
    # Simple flag parsing; anything unknown prints usage and exits.
    while len(flags) > 0:
        match flags[0]:
            case "--download":
                download = True
            case "--extract":
                extract = True
            case _:
                eprint(USAGE)
                exit(1)
        flags = flags[1:]
    if extract and not download:
        eprint("cannot extract without downloading")
        exit(1)
    try:
        # Locate the most recent job to complete with success on our branch
        latest_job = sp.check_output(
            [
                "gh",
                "run",
                "list",
                "--status=success",
                f"--branch={DEFAULT_BRANCH}",
                "--json=databaseId,url,headSha,conclusion,createdAt,"
                "status,workflowDatabaseId,workflowName",
                # Return the first array element matching our workflow name. NB: cannot
                # just use `--limit=1`, jq filtering happens after limiting. We also
                # cannot just use `--workflow` because GH gets confused from
                # different file names in history.
                f'--jq=[.[] | select(.workflowName == "{WORKFLOW_NAME}")][0]',
            ],
            text=True,
        )
    except sp.CalledProcessError as e:
        # Deliberately best-effort: see the docstring.
        eprint(f"failed to run github command: {e}")
        return
    try:
        latest = json.loads(latest_job)
        eprint("latest job: ", json.dumps(latest, indent=4))
    except json.JSONDecodeError as e:
        eprint(f"failed to decode json '{latest_job}', {e}")
        return
    if not download:
        eprint("--download not specified, returning")
        return
    job_id = latest.get("databaseId")
    if job_id is None:
        eprint("skipping download step")
        return
    # check=False: a failed download must not fail CI (see docstring).
    sp.run(
        ["gh", "run", "download", str(job_id), f"--pattern={ARTIFACT_GLOB}"],
        check=False,
    )
    if not extract:
        eprint("skipping extraction step")
        return
    # Find the baseline with the most recent timestamp. GH downloads the files to e.g.
    # `some-dirname/some-dirname.tar.xz`, so just glob the whole thing together.
    candidate_baselines = glob(f"{ARTIFACT_GLOB}/{ARTIFACT_GLOB}")
    if len(candidate_baselines) == 0:
        eprint("no possible baseline directories found")
        return
    # Names embed a sortable UTC timestamp, so lexicographic max is newest.
    candidate_baselines.sort(reverse=True)
    baseline_archive = candidate_baselines[0]
    eprint(f"extracting {baseline_archive}")
    sp.run(["tar", "xJvf", baseline_archive], check=True)
    eprint("baseline extracted successfully")
def check_iai_regressions(args: list[str]):
"""Find regressions in iai summary.json files, exit with failure if any are
found.
"""
iai_home_str = "iai-home"
pr_number = None
while len(args) > 0:
match args:
case ["--home", home, *rest]:
iai_home_str = home
args = rest
case ["--allow-pr-override", pr_num, *rest]:
pr_number = pr_num
args = rest
case _:
eprint(USAGE)
exit(1)
iai_home = Path(iai_home_str)
found_summaries = False
regressions: list[dict] = []
for summary_path in iglob("**/summary.json", root_dir=iai_home, recursive=True):
found_summaries = True
with open(iai_home / summary_path, "r") as f:
summary = json.load(f)
summary_regs = []
run = summary["callgrind_summary"]["callgrind_run"]
fname = summary["function_name"]
id = summary["id"]
name_entry = {"name": f"{fname}.{id}"}
for segment in run["segments"]:
summary_regs.extend(segment["regressions"])
summary_regs.extend(run["total"]["regressions"])
regressions.extend(name_entry | reg for reg in summary_regs)
if not found_summaries:
eprint(f"did not find any summary.json files within {iai_home}")
exit(1)
if len(regressions) == 0:
eprint("No regressions found")
return
eprint("Found regressions:", json.dumps(regressions, indent=4))
if pr_number is not None:
pr = PrInfo.load(pr_number)
if pr.contains_directive(REGRESSION_DIRECTIVE):
eprint("PR allows regressions, returning")
return
exit(1)
def main():
    """Dispatch to the requested subcommand; unknown input prints usage."""
    match sys.argv[1:]:
        case ["generate-matrix"]:
            ctx = Context()
            ctx.emit_workflow_output()
        case ["locate-baseline", *flags]:
            locate_baseline(flags)
        case ["check-regressions", *args]:
            check_iai_regressions(args)
        case ["--help" | "-h"]:
            print(USAGE)
            exit()
        case _:
            eprint(USAGE)
            exit(1)


if __name__ == "__main__":
    main()

View file

@ -0,0 +1,16 @@
# Cross-test image for aarch64-unknown-linux-gnu: GCC cross toolchain plus
# qemu-user-static so test binaries run under emulation.
ARG IMAGE=ubuntu:24.04
FROM $IMAGE
RUN apt-get update && \
    apt-get install -y --no-install-recommends \
    gcc libc6-dev ca-certificates \
    gcc-aarch64-linux-gnu m4 make libc6-dev-arm64-cross \
    qemu-user-static
ENV TOOLCHAIN_PREFIX=aarch64-linux-gnu-
# Point cargo/cc at the cross tools; RUST_TEST_THREADS=1 serializes the tests.
ENV CARGO_TARGET_AARCH64_UNKNOWN_LINUX_GNU_LINKER="$TOOLCHAIN_PREFIX"gcc \
    CARGO_TARGET_AARCH64_UNKNOWN_LINUX_GNU_RUNNER=qemu-aarch64-static \
    AR_aarch64_unknown_linux_gnu="$TOOLCHAIN_PREFIX"ar \
    CC_aarch64_unknown_linux_gnu="$TOOLCHAIN_PREFIX"gcc \
    QEMU_LD_PREFIX=/usr/aarch64-linux-gnu \
    RUST_TEST_THREADS=1

View file

@ -0,0 +1,15 @@
# Cross-test image for arm-unknown-linux-gnueabi (soft-float ARM) under qemu.
ARG IMAGE=ubuntu:24.04
FROM $IMAGE
RUN apt-get update && \
    apt-get install -y --no-install-recommends \
    gcc libc6-dev ca-certificates \
    gcc-arm-linux-gnueabi libc6-dev-armel-cross qemu-user-static
ENV TOOLCHAIN_PREFIX=arm-linux-gnueabi-
ENV CARGO_TARGET_ARM_UNKNOWN_LINUX_GNUEABI_LINKER="$TOOLCHAIN_PREFIX"gcc \
    CARGO_TARGET_ARM_UNKNOWN_LINUX_GNUEABI_RUNNER=qemu-arm-static \
    AR_arm_unknown_linux_gnueabi="$TOOLCHAIN_PREFIX"ar \
    CC_arm_unknown_linux_gnueabi="$TOOLCHAIN_PREFIX"gcc \
    QEMU_LD_PREFIX=/usr/arm-linux-gnueabi \
    RUST_TEST_THREADS=1

View file

@ -0,0 +1,15 @@
# Cross-test image for arm-unknown-linux-gnueabihf (hard-float ARM) under qemu.
ARG IMAGE=ubuntu:24.04
FROM $IMAGE
RUN apt-get update && \
    apt-get install -y --no-install-recommends \
    gcc libc6-dev ca-certificates \
    gcc-arm-linux-gnueabihf libc6-dev-armhf-cross qemu-user-static
ENV TOOLCHAIN_PREFIX=arm-linux-gnueabihf-
ENV CARGO_TARGET_ARM_UNKNOWN_LINUX_GNUEABIHF_LINKER="$TOOLCHAIN_PREFIX"gcc \
    CARGO_TARGET_ARM_UNKNOWN_LINUX_GNUEABIHF_RUNNER=qemu-arm-static \
    AR_arm_unknown_linux_gnueabihf="$TOOLCHAIN_PREFIX"ar \
    CC_arm_unknown_linux_gnueabihf="$TOOLCHAIN_PREFIX"gcc \
    QEMU_LD_PREFIX=/usr/arm-linux-gnueabihf \
    RUST_TEST_THREADS=1

View file

@ -0,0 +1,15 @@
# Cross-test image for armv7-unknown-linux-gnueabihf; shares the gnueabihf
# GCC toolchain with the generic arm hard-float image.
ARG IMAGE=ubuntu:24.04
FROM $IMAGE
RUN apt-get update && \
    apt-get install -y --no-install-recommends \
    gcc libc6-dev ca-certificates \
    gcc-arm-linux-gnueabihf libc6-dev-armhf-cross qemu-user-static
ENV TOOLCHAIN_PREFIX=arm-linux-gnueabihf-
ENV CARGO_TARGET_ARMV7_UNKNOWN_LINUX_GNUEABIHF_LINKER="$TOOLCHAIN_PREFIX"gcc \
    CARGO_TARGET_ARMV7_UNKNOWN_LINUX_GNUEABIHF_RUNNER=qemu-arm-static \
    AR_armv7_unknown_linux_gnueabihf="$TOOLCHAIN_PREFIX"ar \
    CC_armv7_unknown_linux_gnueabihf="$TOOLCHAIN_PREFIX"gcc \
    QEMU_LD_PREFIX=/usr/arm-linux-gnueabihf \
    RUST_TEST_THREADS=1

View file

@ -0,0 +1,6 @@
# Native test image for a 32-bit x86 target; gcc-multilib supplies the 32-bit
# libraries. (Exact target comes from the Dockerfile's path, not visible here.)
ARG IMAGE=ubuntu:24.04
FROM $IMAGE
RUN apt-get update && \
    apt-get install -y --no-install-recommends \
    gcc-multilib m4 make libc6-dev ca-certificates

View file

@ -0,0 +1,6 @@
# Native test image for a 32-bit x86 target; identical toolchain to the other
# multilib image. (Exact target comes from the Dockerfile's path.)
ARG IMAGE=ubuntu:24.04
FROM $IMAGE
RUN apt-get update && \
    apt-get install -y --no-install-recommends \
    gcc-multilib m4 make libc6-dev ca-certificates

View file

@ -0,0 +1,14 @@
# Cross-test image for loongarch64-unknown-linux-gnu; uses the versioned
# gcc-14 cross compiler packaged by Ubuntu.
ARG IMAGE=ubuntu:24.04
FROM $IMAGE
RUN apt-get update && \
    apt-get install -y --no-install-recommends \
    gcc libc6-dev qemu-user-static ca-certificates \
    gcc-14-loongarch64-linux-gnu libc6-dev-loong64-cross
ENV CARGO_TARGET_LOONGARCH64_UNKNOWN_LINUX_GNU_LINKER=loongarch64-linux-gnu-gcc-14 \
    CARGO_TARGET_LOONGARCH64_UNKNOWN_LINUX_GNU_RUNNER=qemu-loongarch64-static \
    AR_loongarch64_unknown_linux_gnu=loongarch64-linux-gnu-ar \
    CC_loongarch64_unknown_linux_gnu=loongarch64-linux-gnu-gcc-14 \
    QEMU_LD_PREFIX=/usr/loongarch64-linux-gnu \
    RUST_TEST_THREADS=1

View file

@ -0,0 +1,16 @@
# Cross-test image for mips-unknown-linux-gnu (big-endian MIPS) under qemu.
ARG IMAGE=ubuntu:24.04
FROM $IMAGE
RUN apt-get update && \
    apt-get install -y --no-install-recommends \
    gcc libc6-dev ca-certificates \
    gcc-mips-linux-gnu libc6-dev-mips-cross \
    binfmt-support qemu-user-static qemu-system-mips
ENV TOOLCHAIN_PREFIX=mips-linux-gnu-
ENV CARGO_TARGET_MIPS_UNKNOWN_LINUX_GNU_LINKER="$TOOLCHAIN_PREFIX"gcc \
    CARGO_TARGET_MIPS_UNKNOWN_LINUX_GNU_RUNNER=qemu-mips-static \
    AR_mips_unknown_linux_gnu="$TOOLCHAIN_PREFIX"ar \
    CC_mips_unknown_linux_gnu="$TOOLCHAIN_PREFIX"gcc \
    QEMU_LD_PREFIX=/usr/mips-linux-gnu \
    RUST_TEST_THREADS=1

View file

@ -0,0 +1,20 @@
# Cross-test image for mips64-unknown-linux-gnuabi64 under qemu.
ARG IMAGE=ubuntu:24.04
FROM $IMAGE
RUN apt-get update && \
    apt-get install -y --no-install-recommends \
    ca-certificates \
    gcc \
    gcc-mips64-linux-gnuabi64 \
    libc6-dev \
    libc6-dev-mips64-cross \
    qemu-user-static \
    qemu-system-mips
ENV TOOLCHAIN_PREFIX=mips64-linux-gnuabi64-
ENV CARGO_TARGET_MIPS64_UNKNOWN_LINUX_GNUABI64_LINKER="$TOOLCHAIN_PREFIX"gcc \
    CARGO_TARGET_MIPS64_UNKNOWN_LINUX_GNUABI64_RUNNER=qemu-mips64-static \
    AR_mips64_unknown_linux_gnuabi64="$TOOLCHAIN_PREFIX"ar \
    CC_mips64_unknown_linux_gnuabi64="$TOOLCHAIN_PREFIX"gcc \
    QEMU_LD_PREFIX=/usr/mips64-linux-gnuabi64 \
    RUST_TEST_THREADS=1

View file

@ -0,0 +1,19 @@
# Cross-test image for mips64el-unknown-linux-gnuabi64 (little-endian) under qemu.
ARG IMAGE=ubuntu:24.04
FROM $IMAGE
RUN apt-get update && \
    apt-get install -y --no-install-recommends \
    ca-certificates \
    gcc \
    gcc-mips64el-linux-gnuabi64 \
    libc6-dev \
    libc6-dev-mips64el-cross \
    qemu-user-static
ENV TOOLCHAIN_PREFIX=mips64el-linux-gnuabi64-
ENV CARGO_TARGET_MIPS64EL_UNKNOWN_LINUX_GNUABI64_LINKER="$TOOLCHAIN_PREFIX"gcc \
    CARGO_TARGET_MIPS64EL_UNKNOWN_LINUX_GNUABI64_RUNNER=qemu-mips64el-static \
    AR_mips64el_unknown_linux_gnuabi64="$TOOLCHAIN_PREFIX"ar \
    CC_mips64el_unknown_linux_gnuabi64="$TOOLCHAIN_PREFIX"gcc \
    QEMU_LD_PREFIX=/usr/mips64el-linux-gnuabi64 \
    RUST_TEST_THREADS=1

View file

@ -0,0 +1,16 @@
# Cross-test image for mipsel-unknown-linux-gnu (little-endian MIPS) under qemu.
ARG IMAGE=ubuntu:24.04
FROM $IMAGE
RUN apt-get update && \
    apt-get install -y --no-install-recommends \
    gcc libc6-dev ca-certificates \
    gcc-mipsel-linux-gnu libc6-dev-mipsel-cross \
    binfmt-support qemu-user-static
ENV TOOLCHAIN_PREFIX=mipsel-linux-gnu-
ENV CARGO_TARGET_MIPSEL_UNKNOWN_LINUX_GNU_LINKER="$TOOLCHAIN_PREFIX"gcc \
    CARGO_TARGET_MIPSEL_UNKNOWN_LINUX_GNU_RUNNER=qemu-mipsel-static \
    AR_mipsel_unknown_linux_gnu="$TOOLCHAIN_PREFIX"ar \
    CC_mipsel_unknown_linux_gnu="$TOOLCHAIN_PREFIX"gcc \
    QEMU_LD_PREFIX=/usr/mipsel-linux-gnu \
    RUST_TEST_THREADS=1

View file

@ -0,0 +1,16 @@
# Cross-test image for powerpc-unknown-linux-gnu (32-bit PowerPC) under qemu.
ARG IMAGE=ubuntu:24.04
FROM $IMAGE
RUN apt-get update && \
    apt-get install -y --no-install-recommends \
    gcc libc6-dev qemu-user-static ca-certificates \
    gcc-powerpc-linux-gnu libc6-dev-powerpc-cross \
    qemu-system-ppc
ENV TOOLCHAIN_PREFIX=powerpc-linux-gnu-
ENV CARGO_TARGET_POWERPC_UNKNOWN_LINUX_GNU_LINKER="$TOOLCHAIN_PREFIX"gcc \
    CARGO_TARGET_POWERPC_UNKNOWN_LINUX_GNU_RUNNER=qemu-ppc-static \
    AR_powerpc_unknown_linux_gnu="$TOOLCHAIN_PREFIX"ar \
    CC_powerpc_unknown_linux_gnu="$TOOLCHAIN_PREFIX"gcc \
    QEMU_LD_PREFIX=/usr/powerpc-linux-gnu \
    RUST_TEST_THREADS=1

View file

@ -0,0 +1,16 @@
# Cross-test image for powerpc64-unknown-linux-gnu (big-endian) under qemu.
ARG IMAGE=ubuntu:24.04
FROM $IMAGE
RUN apt-get update && \
    apt-get install -y --no-install-recommends \
    gcc libc6-dev ca-certificates \
    gcc-powerpc64-linux-gnu libc6-dev-ppc64-cross \
    binfmt-support qemu-user-static qemu-system-ppc
ENV TOOLCHAIN_PREFIX=powerpc64-linux-gnu-
ENV CARGO_TARGET_POWERPC64_UNKNOWN_LINUX_GNU_LINKER="$TOOLCHAIN_PREFIX"gcc \
    CARGO_TARGET_POWERPC64_UNKNOWN_LINUX_GNU_RUNNER=qemu-ppc64-static \
    AR_powerpc64_unknown_linux_gnu="$TOOLCHAIN_PREFIX"ar \
    CC_powerpc64_unknown_linux_gnu="$TOOLCHAIN_PREFIX"gcc \
    QEMU_LD_PREFIX=/usr/powerpc64-linux-gnu \
    RUST_TEST_THREADS=1

View file

@ -0,0 +1,17 @@
# Cross-test image for powerpc64le-unknown-linux-gnu; QEMU_CPU pins the
# emulated CPU model to POWER8.
ARG IMAGE=ubuntu:24.04
FROM $IMAGE
RUN apt-get update && \
    apt-get install -y --no-install-recommends \
    gcc libc6-dev qemu-user-static ca-certificates \
    gcc-powerpc64le-linux-gnu libc6-dev-ppc64el-cross \
    qemu-system-ppc
ENV TOOLCHAIN_PREFIX=powerpc64le-linux-gnu-
ENV CARGO_TARGET_POWERPC64LE_UNKNOWN_LINUX_GNU_LINKER="$TOOLCHAIN_PREFIX"gcc \
    CARGO_TARGET_POWERPC64LE_UNKNOWN_LINUX_GNU_RUNNER=qemu-ppc64le-static \
    AR_powerpc64le_unknown_linux_gnu="$TOOLCHAIN_PREFIX"ar \
    CC_powerpc64le_unknown_linux_gnu="$TOOLCHAIN_PREFIX"gcc \
    QEMU_CPU=POWER8 \
    QEMU_LD_PREFIX=/usr/powerpc64le-linux-gnu \
    RUST_TEST_THREADS=1

View file

@ -0,0 +1,16 @@
# Test image for `riscv64gc-unknown-linux-gnu`: installs the riscv64 cross
# toolchain and QEMU user emulation so test binaries can be executed.
ARG IMAGE=ubuntu:24.04
FROM $IMAGE
RUN apt-get update && \
apt-get install -y --no-install-recommends \
gcc libc6-dev qemu-user-static ca-certificates \
gcc-riscv64-linux-gnu libc6-dev-riscv64-cross \
qemu-system-riscv64
ENV TOOLCHAIN_PREFIX=riscv64-linux-gnu-
# Point cargo/cc at the cross toolchain and run tests through QEMU.
ENV CARGO_TARGET_RISCV64GC_UNKNOWN_LINUX_GNU_LINKER="$TOOLCHAIN_PREFIX"gcc \
CARGO_TARGET_RISCV64GC_UNKNOWN_LINUX_GNU_RUNNER=qemu-riscv64-static \
AR_riscv64gc_unknown_linux_gnu="$TOOLCHAIN_PREFIX"ar \
CC_riscv64gc_unknown_linux_gnu="$TOOLCHAIN_PREFIX"gcc \
QEMU_LD_PREFIX=/usr/riscv64-linux-gnu \
RUST_TEST_THREADS=1

View file

@ -0,0 +1,9 @@
# Image for a bare-metal thumb (arm-none-eabi) target: installs the
# arm-none-eabi GCC toolchain and newlib. BUILD_ONLY=1 tells ci/run.sh to
# compile but skip running tests, since these targets cannot execute here.
ARG IMAGE=ubuntu:24.04
FROM $IMAGE
RUN apt-get update && \
apt-get install -y --no-install-recommends \
gcc libc6-dev ca-certificates \
gcc-arm-none-eabi \
libnewlib-arm-none-eabi
ENV BUILD_ONLY=1

View file

@ -0,0 +1,9 @@
# Image for a bare-metal thumb (arm-none-eabi) target: installs the
# arm-none-eabi GCC toolchain and newlib. BUILD_ONLY=1 tells ci/run.sh to
# compile but skip running tests, since these targets cannot execute here.
ARG IMAGE=ubuntu:24.04
FROM $IMAGE
RUN apt-get update && \
apt-get install -y --no-install-recommends \
gcc libc6-dev ca-certificates \
gcc-arm-none-eabi \
libnewlib-arm-none-eabi
ENV BUILD_ONLY=1

View file

@ -0,0 +1,9 @@
# Image for a bare-metal thumb (arm-none-eabi) target: installs the
# arm-none-eabi GCC toolchain and newlib. BUILD_ONLY=1 tells ci/run.sh to
# compile but skip running tests, since these targets cannot execute here.
ARG IMAGE=ubuntu:24.04
FROM $IMAGE
RUN apt-get update && \
apt-get install -y --no-install-recommends \
gcc libc6-dev ca-certificates \
gcc-arm-none-eabi \
libnewlib-arm-none-eabi
ENV BUILD_ONLY=1

View file

@ -0,0 +1,9 @@
# Image for a bare-metal thumb (arm-none-eabi) target: installs the
# arm-none-eabi GCC toolchain and newlib. BUILD_ONLY=1 tells ci/run.sh to
# compile but skip running tests, since these targets cannot execute here.
ARG IMAGE=ubuntu:24.04
FROM $IMAGE
RUN apt-get update && \
apt-get install -y --no-install-recommends \
gcc libc6-dev ca-certificates \
gcc-arm-none-eabi \
libnewlib-arm-none-eabi
ENV BUILD_ONLY=1

View file

@ -0,0 +1,8 @@
# Image for `wasm32-unknown-unknown`. The cargo "runner" is set to the
# `true` binary, so test executables are built but not actually executed
# (there is no wasm runtime installed here).
ARG IMAGE=ubuntu:20.04
FROM $IMAGE
RUN apt-get update && \
apt-get install -y --no-install-recommends \
gcc clang libc6-dev ca-certificates
ENV CARGO_TARGET_WASM32_UNKNOWN_UNKNOWN_RUNNER=true

View file

@ -0,0 +1,6 @@
# Native x86_64 test image. `m4` and `make` are included — presumably for
# building C dependencies (e.g. musl/GMP via build scripts); confirm.
ARG IMAGE=ubuntu:24.04
FROM $IMAGE
RUN apt-get update && \
apt-get install -y --no-install-recommends \
gcc m4 make libc6-dev ca-certificates

View file

@ -0,0 +1,10 @@
#!/bin/sh
# Download sources to build C versions of intrinsics. Once being run,
# `RUST_COMPILER_RT_ROOT` must be set.
set -eux

# Pinned rust-lang/llvm-project release tag to fetch.
version=20.1-2025-02-13
archive_url="https://github.com/rust-lang/llvm-project/archive/rustc/${version}.tar.gz"

# Fetch the tarball, then extract only the compiler-rt subtree into the
# current directory (the leading archive directory is stripped).
curl -L -o code.tar.gz "$archive_url"
tar xzf code.tar.gz --strip-components 1 "llvm-project-rustc-${version}/compiler-rt"

View file

@ -0,0 +1,18 @@
#!/bin/bash
# Run the `mem` subset of builtins-test under Miri on a few representative
# targets.
set -eux

# We need Tree Borrows as some of our raw pointer patterns are not
# compatible with Stacked Borrows.
export MIRIFLAGS="-Zmiri-tree-borrows"

# One target that sets `mem-unaligned` and one that does not,
# and a big-endian target.
for tgt in \
    x86_64-unknown-linux-gnu \
    armv7-unknown-linux-gnueabihf \
    s390x-unknown-linux-gnu
do
    # Only run the `mem` tests to avoid this taking too long.
    cargo miri test --manifest-path builtins-test/Cargo.toml --features no-asm --target "$tgt" -- mem
done

View file

@ -0,0 +1,111 @@
#!/bin/bash
# Small script to run tests for a target (or all targets) inside all the
# respective docker images.
set -euxo pipefail
# Normalize Apple's `arm64` spelling to `aarch64` so it compares equal to
# the arch component of Rust target triples.
host_arch="$(uname -m | sed 's/arm64/aarch64/')"
# Directories and files that do not yet exist need to be created before
# calling docker, otherwise docker will create them but they will be owned
# by root.
mkdir -p target
cargo generate-lockfile
cargo generate-lockfile --manifest-path builtins-test-intrinsics/Cargo.toml
# Build the docker image for target `$1` and execute `ci/run.sh` for that
# target inside it. The checkout is mounted read-only at /checkout; on Linux
# without DOCKER_BASE_IMAGE the host rustc, cargo home, and target dir are
# shared with the container.
run() {
    local target="$1"

    echo "testing target: $target"

    # Flag targets whose arch differs from the host; exported to the
    # container as EMULATED so run.sh can adapt.
    emulated=""
    target_arch="$(echo "$target" | cut -d'-' -f1)"
    if [ "$target_arch" != "$host_arch" ]; then
        emulated=1
        echo "target is emulated"
    fi

    # Env-prefix + command string eventually executed via `sh -c` in the
    # container.
    run_cmd="HOME=/tmp"

    if [ "${GITHUB_ACTIONS:-}" = "true" ]; then
        # Enable Docker image caching on GHA
        build_cmd=("buildx" "build")
        build_args=(
            "--cache-from" "type=local,src=/tmp/.buildx-cache"
            "--cache-to" "type=local,dest=/tmp/.buildx-cache-new"
            # This is the beautiful bash syntax for expanding an array but neither
            # raising an error nor returning an empty string if the array is empty.
            "${build_args[@]:+"${build_args[@]}"}"
            "--load"
        )
    fi

    if [ "$(uname -s)" = "Linux" ] && [ -z "${DOCKER_BASE_IMAGE:-}" ]; then
        # Share the host rustc and target. Do this only on Linux and if the image
        # isn't overridden
        run_args=(
            --user "$(id -u):$(id -g)"
            -e "CARGO_HOME=/cargo"
            -v "${HOME}/.cargo:/cargo"
            -v "$(pwd)/target:/builtins-target"
            -v "$(rustc --print sysroot):/rust:ro"
        )
        run_cmd="$run_cmd PATH=\$PATH:/rust/bin:/cargo/bin"
    else
        # Use rustc provided by a docker image
        docker volume create compiler-builtins-cache
        build_args=(
            "--build-arg"
            "IMAGE=${DOCKER_BASE_IMAGE:-rustlang/rust:nightly}"
        )
        run_args=(-v "compiler-builtins-cache:/builtins-target")
        # FIX: this previously read `run_cmd="$run_cmd HOME=/tmp" "USING_CONTAINER_RUSTC=1"`,
        # which is the `VAR=val command` form: it tried to *execute* a program
        # named `USING_CONTAINER_RUSTC=1` (failing under `set -e`) and the
        # run_cmd assignment was only temporary. Append the variable to the
        # in-container env prefix instead (HOME=/tmp is already present).
        run_cmd="$run_cmd USING_CONTAINER_RUSTC=1"
    fi

    if [ -d compiler-rt ]; then
        export RUST_COMPILER_RT_ROOT="/checkout/compiler-rt"
    fi

    run_cmd="$run_cmd ci/run.sh $target"

    docker "${build_cmd[@]:-build}" \
        -t "builtins-$target" \
        "${build_args[@]:-}" \
        "ci/docker/$target"
    docker run \
        --rm \
        -e CI \
        -e CARGO_TARGET_DIR=/builtins-target \
        -e CARGO_TERM_COLOR \
        -e MAY_SKIP_LIBM_CI \
        -e RUSTFLAGS \
        -e RUST_BACKTRACE \
        -e RUST_COMPILER_RT_ROOT \
        -e "EMULATED=$emulated" \
        -v "$(pwd):/checkout:ro" \
        -w /checkout \
        "${run_args[@]:-}" \
        --init \
        "builtins-$target" \
        sh -c "$run_cmd"
}
# CLI handling: no argument runs every target found under ci/docker/;
# exactly one argument runs just that target; anything else prints usage.
if [ "${1:-}" = "--help" ] || [ "$#" -gt 1 ]; then
    set +x
    echo "\
usage: ./ci/run-docker.sh [target]
you can also set DOCKER_BASE_IMAGE to use something other than the default
ubuntu:24.04 (or rustlang/rust:nightly).
"
    exit
fi

if [ -z "${1:-}" ]; then
    for d in ci/docker/*; do
        # Quote the command substitution so an unexpected space in a
        # directory name cannot word-split the argument (was unquoted).
        run "$(basename "$d")"
    done
else
    run "$1"
fi

View file

@ -0,0 +1,24 @@
#!/bin/bash
# Run the libm test suite for the test names listed in `$TO_TEST` (set by
# CI), then the extensive generators for the same set.
set -euo pipefail

# Use `${TO_TEST:-}` so an unset variable prints the friendly message below
# instead of aborting immediately under `set -u`.
echo "Tests to run: '${TO_TEST:-}'"

if [ -z "${TO_TEST:-}" ]; then
    echo "No tests to run, exiting."
    exit
fi

set -x

# Shared cargo invocation; release-checked keeps debug assertions enabled.
test_cmd=(
    cargo test
    --package libm-test
    --features "build-mpfr,libm/unstable,libm/force-soft-floats"
    --profile release-checked
)

# Run the non-extensive tests first to catch any easy failures
"${test_cmd[@]}" -- "$TO_TEST"

LIBM_EXTENSIVE_TESTS="$TO_TEST" "${test_cmd[@]}" -- extensive

View file

@ -0,0 +1,302 @@
#!/bin/bash
# Main CI test script: tests compiler_builtins and libm for one target,
# passed as `$1` (defaults to the host target).
set -eux
export RUST_BACKTRACE="${RUST_BACKTRACE:-full}"
export NEXTEST_STATUS_LEVEL=all
target="${1:-}"
if [ -z "$target" ]; then
# No target given: fall back to the toolchain's host triple.
host_target=$(rustc -vV | awk '/^host/ { print $2 }')
echo "Defaulted to host target $host_target"
target="$host_target"
fi
if [[ "$target" = *"wasm"* ]]; then
# Enable the random backend
export RUSTFLAGS="${RUSTFLAGS:-} --cfg getrandom_backend=\"wasm_js\""
fi
if [ "${USING_CONTAINER_RUSTC:-}" = 1 ]; then
# Install nonstandard components if we have control of the environment
rustup target list --installed |
grep -E "^$target\$" ||
rustup target add "$target"
fi
# Test our implementation
if [ "${BUILD_ONLY:-}" = "1" ]; then
echo "no tests to run for build-only targets"
else
# Exercise builtins-test under each feature combination, in both debug and
# release, plus the benchmarks.
test_builtins=(cargo test --package builtins-test --no-fail-fast --target "$target")
"${test_builtins[@]}"
"${test_builtins[@]}" --release
"${test_builtins[@]}" --features c
"${test_builtins[@]}" --features c --release
"${test_builtins[@]}" --features no-asm
"${test_builtins[@]}" --features no-asm --release
"${test_builtins[@]}" --features no-f16-f128
"${test_builtins[@]}" --features no-f16-f128 --release
"${test_builtins[@]}" --benches
"${test_builtins[@]}" --benches --release
if [ "${TEST_VERBATIM:-}" = "1" ]; then
# Build a Windows verbatim (`\\?\`) target path and make sure the build
# still works with it.
verb_path=$(cmd.exe //C echo \\\\?\\%cd%\\builtins-test\\target2)
"${test_builtins[@]}" --target-dir "$verb_path" --features c
fi
fi
declare -a rlib_paths

# Set the `rlib_paths` global array to a list of all compiler-builtins rlibs
update_rlib_paths() {
    local base
    if [ -d /builtins-target ]; then
        base=/builtins-target
    else
        base=target
    fi
    rlib_paths=( "$base"/"${target}"/debug/deps/libcompiler_builtins-*.rlib )
}
# Remove any existing artifacts from previous tests that don't set #![compiler_builtins]
update_rlib_paths
rm -f "${rlib_paths[@]}"
# Rebuild compiler_builtins under each feature combination so the rlibs
# inspected below are fresh.
cargo build -p compiler_builtins --target "$target"
cargo build -p compiler_builtins --target "$target" --release
cargo build -p compiler_builtins --target "$target" --features c
cargo build -p compiler_builtins --target "$target" --features c --release
cargo build -p compiler_builtins --target "$target" --features no-asm
cargo build -p compiler_builtins --target "$target" --features no-asm --release
cargo build -p compiler_builtins --target "$target" --features no-f16-f128
cargo build -p compiler_builtins --target "$target" --features no-f16-f128 --release
# Derive the binutils prefix from the target triple (drop the `unknown-`
# vendor component), with overrides for a few targets.
PREFIX=${target//unknown-/}-
case "$target" in
armv7-*)
PREFIX=arm-linux-gnueabihf-
;;
thumb*)
PREFIX=arm-none-eabi-
;;
*86*-*)
PREFIX=
;;
esac
# Prefer the sysroot's llvm-nm; fall back to the prefixed binutils nm.
NM=$(find "$(rustc --print sysroot)" \( -name llvm-nm -o -name llvm-nm.exe \) )
if [ "$NM" = "" ]; then
NM="${PREFIX}nm"
fi
# i686-pc-windows-gnu tools have a dependency on some DLLs, so run it with
# rustup run to ensure that those are in PATH.
TOOLCHAIN="$(rustup show active-toolchain | sed 's/ (default)//')"
if [[ "$TOOLCHAIN" == *i686-pc-windows-gnu ]]; then
NM="rustup run $TOOLCHAIN $NM"
fi
# Look out for duplicated symbols when we include the compiler-rt (C) implementation
update_rlib_paths
for rlib in "${rlib_paths[@]}"; do
    set +x
    echo "================================================================"
    echo "checking $rlib for duplicate symbols"
    echo "================================================================"
    set -x

    duplicates_found=0

    # NOTE On i586, It's normal that the get_pc_thunk symbol appears several
    # times so ignore it
    #
    # FIX: the filter stage previously ran with `--quiet`, which suppresses
    # ALL grep output, so the final `grep 'T __'` never saw any input and
    # `duplicates_found` could never be set — the check was vacuous.
    $NM -g --defined-only "$rlib" 2>&1 |
        sort |
        uniq -d |
        grep -v __x86.get_pc_thunk |
        grep 'T __' && duplicates_found=1

    if [ "$duplicates_found" != 0 ]; then
        echo "error: found duplicate symbols"
        exit 1
    else
        echo "success; no duplicate symbols found"
    fi
done

rm -f "${rlib_paths[@]}"
# Build the out-of-workspace builtins-test-intrinsics crate for `$target`,
# forwarding any extra cargo flags.
build_intrinsics_test() {
cargo build \
--target "$target" --verbose \
--manifest-path builtins-test-intrinsics/Cargo.toml "$@"
}
# Verify that we haven't dropped any intrinsics/symbols
build_intrinsics_test
build_intrinsics_test --release
build_intrinsics_test --features c
build_intrinsics_test --features c --release
# Verify that there are no undefined symbols to `panic` within our
# implementations
CARGO_PROFILE_DEV_LTO=true build_intrinsics_test
CARGO_PROFILE_RELEASE_LTO=true build_intrinsics_test --release
# Ensure no references to any symbols from core
update_rlib_paths
for rlib in "${rlib_paths[@]}"; do
    set +x
    echo "================================================================"
    echo "checking $rlib for references to core"
    echo "================================================================"
    set -x

    tmpdir="${CARGO_TARGET_DIR:-target}/tmp"
    test -d "$tmpdir" || mkdir "$tmpdir"
    defined="$tmpdir/defined_symbols.txt"
    # FIX: this previously pointed at "defined_symbols.txt" as well, so the
    # undefined-symbol list overwrote the defined-symbol list and the grep
    # below compared the file against itself — the check could never fail.
    undefined="$tmpdir/undefined_symbols.txt"

    # Defined (T) vs. undefined (U) core symbols; any undefined core symbol
    # not also defined in the rlib is an unwanted external reference.
    $NM --quiet -U "$rlib" | grep 'T _ZN4core' | awk '{print $3}' | sort | uniq > "$defined"
    $NM --quiet -u "$rlib" | grep 'U _ZN4core' | awk '{print $2}' | sort | uniq > "$undefined"
    grep_has_results=0
    grep -v -F -x -f "$defined" "$undefined" && grep_has_results=1

    if [ "$target" = "powerpc64-unknown-linux-gnu" ]; then
        echo "FIXME: powerpc64 fails these tests"
    elif [ "$grep_has_results" != 0 ]; then
        echo "error: found unexpected references to core"
        exit 1
    else
        echo "success; no references to core found"
    fi
done
# Test libm
# Make sure a simple build works
cargo check -p libm --no-default-features --target "$target"
if [ "${MAY_SKIP_LIBM_CI:-}" = "true" ]; then
echo "skipping libm PR CI"
exit
fi
# Accumulate the cargo flags used by every libm test invocation below.
mflags=()
# We enumerate features manually.
mflags+=(--no-default-features)
# Enable arch-specific routines when available.
mflags+=(--features arch)
# Always enable `unstable-float` since it expands available API but does not
# change any implementations.
mflags+=(--features unstable-float)
# We need to specifically skip tests for musl-math-sys on systems that can't
# build musl since otherwise `--all` will activate it.
case "$target" in
# Can't build at all on MSVC, WASM, or thumb
*windows-msvc*) mflags+=(--exclude musl-math-sys) ;;
*wasm*) mflags+=(--exclude musl-math-sys) ;;
*thumb*) mflags+=(--exclude musl-math-sys) ;;
# We can build musl on MinGW but running tests gets a stack overflow
*windows-gnu*) ;;
# FIXME(#309): LE PPC crashes calling the musl version of some functions. It
# seems like a qemu bug but should be investigated further at some point.
# See <https://github.com/rust-lang/libm/issues/309>.
*powerpc64le*) ;;
# Everything else gets musl enabled
*) mflags+=(--features libm-test/build-musl) ;;
esac
# Configure which targets test against MPFR
case "$target" in
# MSVC cannot link MPFR
*windows-msvc*) ;;
# FIXME: MinGW should be able to build MPFR, but setup in CI is nontrivial.
*windows-gnu*) ;;
# Targets that aren't cross compiled in CI work fine
aarch64*apple*) mflags+=(--features libm-test/build-mpfr) ;;
aarch64*linux*) mflags+=(--features libm-test/build-mpfr) ;;
i586*) mflags+=(--features libm-test/build-mpfr --features gmp-mpfr-sys/force-cross) ;;
i686*) mflags+=(--features libm-test/build-mpfr) ;;
x86_64*) mflags+=(--features libm-test/build-mpfr) ;;
esac
# FIXME: `STATUS_DLL_NOT_FOUND` testing macros on CI.
# <https://github.com/rust-lang/rust/issues/128944>
case "$target" in
*windows-gnu) mflags+=(--exclude libm-macros) ;;
esac
if [ "${BUILD_ONLY:-}" = "1" ]; then
# If we are on targets that can't run tests, verify that we can build.
cmd=(cargo build --target "$target" --package libm)
"${cmd[@]}"
"${cmd[@]}" --features unstable-intrinsics
echo "can't run tests on $target; skipping"
else
mflags+=(--workspace --target "$target")
cmd=(cargo test "${mflags[@]}")
profile_flag="--profile"
# If nextest is available, use that
command -v cargo-nextest && nextest=1 || nextest=0
if [ "$nextest" = "1" ]; then
cmd=(cargo nextest run --max-fail=10)
# Workaround for https://github.com/nextest-rs/nextest/issues/2066
if [ -f /.dockerenv ]; then
cfg_file="/tmp/nextest-config.toml"
echo "[store]" >> "$cfg_file"
echo "dir = \"$CARGO_TARGET_DIR/nextest\"" >> "$cfg_file"
cmd+=(--config-file "$cfg_file")
fi
# Not all configurations have tests to run on wasm
[[ "$target" = *"wasm"* ]] && cmd+=(--no-tests=warn)
cmd+=("${mflags[@]}")
profile_flag="--cargo-profile"
fi
# Test once without intrinsics
"${cmd[@]}"
# Run doctests if they were excluded by nextest
[ "$nextest" = "1" ] && cargo test --doc --exclude compiler_builtins "${mflags[@]}"
# Exclude the macros and utile crates from the rest of the tests to save CI
# runtime, they shouldn't have anything feature- or opt-level-dependent.
cmd+=(--exclude util --exclude libm-macros)
# Test once with intrinsics enabled
"${cmd[@]}" --features unstable-intrinsics
"${cmd[@]}" --features unstable-intrinsics --benches
# Test the same in release mode, which also increases coverage. Also ensure
# the soft float routines are checked.
"${cmd[@]}" "$profile_flag" release-checked
"${cmd[@]}" "$profile_flag" release-checked --features force-soft-floats
"${cmd[@]}" "$profile_flag" release-checked --features unstable-intrinsics
"${cmd[@]}" "$profile_flag" release-checked --features unstable-intrinsics --benches
# Ensure that the routines do not panic.
#
# `--tests` must be passed because no-panic is only enabled as a dev
# dependency. The `release-opt` profile must be used to enable LTO and a
# single CGU.
ENSURE_NO_PANIC=1 cargo build \
-p libm \
--target "$target" \
--no-default-features \
--features unstable-float \
--tests \
--profile release-opt
fi

View file

@ -0,0 +1,168 @@
# Changelog
All notable changes to this project will be documented in this file.
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
## [Unreleased]
## [0.1.159](https://github.com/rust-lang/compiler-builtins/compare/compiler_builtins-v0.1.158...compiler_builtins-v0.1.159) - 2025-05-12
### Other
- Remove cfg(bootstrap)
## [0.1.158](https://github.com/rust-lang/compiler-builtins/compare/compiler_builtins-v0.1.157...compiler_builtins-v0.1.158) - 2025-05-06
### Other
- Require `target_has_atomic = "ptr"` for runtime feature detection
## [0.1.157](https://github.com/rust-lang/compiler-builtins/compare/compiler_builtins-v0.1.156...compiler_builtins-v0.1.157) - 2025-05-03
### Other
- Use runtime feature detection for fma routines on x86
## [0.1.156](https://github.com/rust-lang/compiler-builtins/compare/compiler_builtins-v0.1.155...compiler_builtins-v0.1.156) - 2025-04-21
### Other
- avr: Provide `abort()`
- Remove `unsafe` from `naked_asm!` blocks
- Enable icount benchmarks in CI
- Move builtins-test-intrinsics out of the workspace
- Run `cargo fmt` on all projects
- Flatten the `libm/libm` directory
- Update path to libm after the merge
## [0.1.155](https://github.com/rust-lang/compiler-builtins/compare/compiler_builtins-v0.1.154...compiler_builtins-v0.1.155) - 2025-04-17
### Other
- use `#[cfg(bootstrap)]` for rustc sync
- Replace the `bl!` macro with `asm_sym`
- __udivmod(h|q)i4
## [0.1.154](https://github.com/rust-lang/compiler-builtins/compare/compiler_builtins-v0.1.153...compiler_builtins-v0.1.154) - 2025-04-16
### Other
- turn #[naked] into an unsafe attribute
## [0.1.153](https://github.com/rust-lang/compiler-builtins/compare/compiler_builtins-v0.1.152...compiler_builtins-v0.1.153) - 2025-04-09
### Other
- Remove a mention of `force-soft-float` in `build.rs`
- Revert "Disable `f16` on AArch64 without the `neon` feature"
- Skip No More!
- avoid out-of-bounds accesses ([#799](https://github.com/rust-lang/compiler-builtins/pull/799))
## [0.1.152](https://github.com/rust-lang/compiler-builtins/compare/compiler_builtins-v0.1.151...compiler_builtins-v0.1.152) - 2025-03-20
### Other
- Remove use of `atomic_load_unordered` and undefined behaviour from `arm_linux.rs`
- Switch repository layout to use a virtual manifest
## [0.1.151](https://github.com/rust-lang/compiler-builtins/compare/compiler_builtins-v0.1.150...compiler_builtins-v0.1.151) - 2025-03-05
### Other
- Add cygwin support
- Enable `f16` for LoongArch ([#770](https://github.com/rust-lang/compiler-builtins/pull/770))
- Add __extendhfdf2 and add __truncdfhf2 test
- Remove outdated information from the readme
## [0.1.150](https://github.com/rust-lang/compiler-builtins/compare/compiler_builtins-v0.1.149...compiler_builtins-v0.1.150) - 2025-03-01
### Other
- Disable `f16` on AArch64 without the `neon` feature
- Update LLVM downloads to 20.1-2025-02-13
## [0.1.149](https://github.com/rust-lang/compiler-builtins/compare/compiler_builtins-v0.1.148...compiler_builtins-v0.1.149) - 2025-02-25
### Other
- Make a subset of `libm` symbols weakly available on all platforms
## [0.1.148](https://github.com/rust-lang/compiler-builtins/compare/compiler_builtins-v0.1.147...compiler_builtins-v0.1.148) - 2025-02-24
### Other
- Update the `libm` submodule
- Enable `f16` for MIPS
- Eliminate the use of `public_test_dep!` for a third time
## [0.1.147](https://github.com/rust-lang/compiler-builtins/compare/compiler_builtins-v0.1.146...compiler_builtins-v0.1.147) - 2025-02-19
### Other
- remove win64_128bit_abi_hack
## [0.1.146](https://github.com/rust-lang/compiler-builtins/compare/compiler_builtins-v0.1.145...compiler_builtins-v0.1.146) - 2025-02-06
### Other
- Expose erf{,c}{,f} from libm
## [0.1.145](https://github.com/rust-lang/compiler-builtins/compare/compiler_builtins-v0.1.144...compiler_builtins-v0.1.145) - 2025-02-04
### Other
- Revert "Eliminate the use of `public_test_dep!`"
- Indentation fix to please clippy
- Don't build out of line atomics support code for uefi
- Add a version to some FIXMEs that will be resolved in LLVM 20
- Remove use of the `start` feature
## [0.1.144](https://github.com/rust-lang/compiler-builtins/compare/compiler_builtins-v0.1.143...compiler_builtins-v0.1.144) - 2025-01-15
### Other
- Eliminate the use of `public_test_dep!`
## [0.1.143](https://github.com/rust-lang/compiler-builtins/compare/compiler_builtins-v0.1.142...compiler_builtins-v0.1.143) - 2025-01-15
### Other
- Use a C-safe return type for `__rust_[ui]128_*` overflowing intrinsics
## [0.1.142](https://github.com/rust-lang/compiler-builtins/compare/compiler_builtins-v0.1.141...compiler_builtins-v0.1.142) - 2025-01-07
### Other
- Account for optimization levels other than numbers
## [0.1.141](https://github.com/rust-lang/compiler-builtins/compare/compiler_builtins-v0.1.140...compiler_builtins-v0.1.141) - 2025-01-07
### Other
- Update the `libm` submodule
- Fix new `clippy::precedence` errors
- Rename `EXP_MAX` to `EXP_SAT`
- Shorten prefixes for float constants
## [0.1.140](https://github.com/rust-lang/compiler-builtins/compare/compiler_builtins-v0.1.139...compiler_builtins-v0.1.140) - 2024-12-26
### Other
- Disable f128 for amdgpu ([#737](https://github.com/rust-lang/compiler-builtins/pull/737))
- Fix a bug in `abs_diff`
- Disable `f16` on platforms that have recursion problems
## [0.1.139](https://github.com/rust-lang/compiler-builtins/compare/compiler_builtins-v0.1.138...compiler_builtins-v0.1.139) - 2024-11-03
### Other
- Remove incorrect `sparcv9` match pattern from `configure_f16_f128`
## [0.1.138](https://github.com/rust-lang/compiler-builtins/compare/compiler_builtins-v0.1.137...compiler_builtins-v0.1.138) - 2024-11-01
### Other
- Use `f16_enabled`/`f128_enabled` in `examples/intrinsics.rs` ([#724](https://github.com/rust-lang/compiler-builtins/pull/724))
- Disable `f16` for LoongArch64 ([#722](https://github.com/rust-lang/compiler-builtins/pull/722))

View file

@ -0,0 +1,64 @@
# Manifest for the `compiler_builtins` crate (intrinsics the compiler links
# into every Rust program).
[package]
authors = ["Jorge Aparicio <japaricious@gmail.com>"]
name = "compiler_builtins"
version = "0.1.159"
license = "MIT AND Apache-2.0 WITH LLVM-exception AND (MIT OR Apache-2.0)"
readme = "README.md"
repository = "https://github.com/rust-lang/compiler-builtins"
homepage = "https://github.com/rust-lang/compiler-builtins"
documentation = "https://docs.rs/compiler_builtins"
edition = "2021"
description = "Compiler intrinsics used by the Rust compiler."
links = "compiler-rt"
[lib]
# Support library only: no benches, doctests, or unit tests on the lib target.
bench = false
doctest = false
test = false
[dependencies]
# For more information on this dependency see
# https://github.com/rust-lang/rust/tree/master/library/rustc-std-workspace-core
core = { version = "1.0.0", optional = true, package = "rustc-std-workspace-core" }
[build-dependencies]
# Only needed when the `c` feature builds compiler-rt C sources.
cc = { optional = true, version = "1.0" }
[dev-dependencies]
panic-handler = { path = "../crates/panic-handler" }
[features]
default = ["compiler-builtins"]
# Enable compilation of C code in compiler-rt, filling in some more optimized
# implementations and also filling in unimplemented intrinsics
c = ["dep:cc"]
# Workaround for the Cranelift codegen backend. Disables any implementations
# which use inline assembly and fall back to pure Rust versions (if available).
no-asm = []
# Workaround for codegen backends which haven't yet implemented `f16` and
# `f128` support. Disables any intrinsics which use those types.
no-f16-f128 = []
# Flag this library as the unstable compiler-builtins lib
compiler-builtins = []
# Generate memory-related intrinsics like memcpy
mem = []
# Mangle all names so this can be linked in with other versions or other
# compiler-rt implementations. Also used for testing
mangled-names = []
# Only used in the compiler's build system
rustc-dep-of-std = ["compiler-builtins", "dep:core"]
# This makes certain traits and function specializations public that
# are not normally public but are required by the `builtins-test`
unstable-public-internals = []
[lints.rust]
# The cygwin config can be dropped after our benchmark toolchain is bumped
unexpected_cfgs = { level = "warn", check-cfg = ['cfg(bootstrap)', 'cfg(target_os, values("cygwin"))'] }

View file

@ -0,0 +1 @@
../LICENSE.txt

View file

@ -0,0 +1,436 @@
# `compiler-builtins`
This crate provides external symbols that the compiler expects to be available
when building Rust projects, typically software routines for basic operations
that do not have hardware support. It is largely a port of LLVM's
[`compiler-rt`].
It is distributed as part of Rust's sysroot. `compiler-builtins` does not need
to be added as an explicit dependency in `Cargo.toml`.
[`compiler-rt`]: https://github.com/llvm/llvm-project/tree/1b1dc505057322f4fa1110ef4f53c44347f52986/compiler-rt
## Contributing
See [CONTRIBUTING.md](CONTRIBUTING.md).
## Progress
- [x] aarch64/chkstk.S
- [x] adddf3.c
- [x] addsf3.c
- [x] arm/addsf3.S
- [x] arm/aeabi_dcmp.S
- [x] arm/aeabi_fcmp.S
- [x] arm/aeabi_idivmod.S
- [x] arm/aeabi_ldivmod.S
- [x] arm/aeabi_memcpy.S
- [x] arm/aeabi_memmove.S
- [x] arm/aeabi_memset.S
- [x] arm/aeabi_uidivmod.S
- [x] arm/aeabi_uldivmod.S
- [ ] arm/chkstk.S
- [ ] arm/divmodsi4.S (generic version is done)
- [ ] arm/divsi3.S (generic version is done)
- [ ] arm/modsi3.S (generic version is done)
- [x] arm/softfloat-alias.list
- [ ] arm/udivmodsi4.S (generic version is done)
- [ ] arm/udivsi3.S (generic version is done)
- [ ] arm/umodsi3.S (generic version is done)
- [x] ashldi3.c
- [x] ashrdi3.c
- [ ] avr/divmodhi4.S
- [ ] avr/divmodqi4.S
- [ ] avr/mulhi3.S
- [ ] avr/mulqi3.S
- [ ] avr/udivmodhi4.S
- [ ] avr/udivmodqi4.S
- [x] bswapdi2.c
- [x] bswapsi2.c
- [x] bswapti2.c
- [x] clzdi2.c
- [x] clzsi2.c
- [x] clzti2.c
- [x] comparedf2.c
- [x] comparesf2.c
- [x] ctzdi2.c
- [x] ctzsi2.c
- [x] ctzti2.c
- [x] divdf3.c
- [x] divdi3.c
- [x] divmoddi4.c
- [x] divmodsi4.c
- [x] divmodti4.c
- [x] divsf3.c
- [x] divsi3.c
- [x] extendsfdf2.c
- [x] fixdfdi.c
- [x] fixdfsi.c
- [x] fixsfdi.c
- [x] fixsfsi.c
- [x] fixunsdfdi.c
- [x] fixunsdfsi.c
- [x] fixunssfdi.c
- [x] fixunssfsi.c
- [x] floatdidf.c
- [x] floatdisf.c
- [x] floatsidf.c
- [x] floatsisf.c
- [x] floatundidf.c
- [x] floatundisf.c
- [x] floatunsidf.c
- [x] floatunsisf.c
- [ ] i386/ashldi3.S
- [ ] i386/ashrdi3.S
- [x] i386/chkstk.S
- [ ] i386/divdi3.S
- [ ] i386/lshrdi3.S
- [ ] i386/moddi3.S
- [ ] i386/muldi3.S
- [ ] i386/udivdi3.S
- [ ] i386/umoddi3.S
- [x] lshrdi3.c
- [x] moddi3.c
- [x] modsi3.c
- [x] muldf3.c
- [x] muldi3.c
- [x] mulodi4.c
- [x] mulosi4.c
- [x] mulsf3.c
- [x] powidf2.c
- [x] powisf2.c
- [ ] riscv/muldi3.S
- [ ] riscv/mulsi3.S
- [x] subdf3.c
- [x] subsf3.c
- [x] truncdfsf2.c
- [x] udivdi3.c
- [x] udivmoddi4.c
- [x] udivmodsi4.c
- [x] udivsi3.c
- [x] umoddi3.c
- [x] umodsi3.c
- [x] x86_64/chkstk.S
These builtins are needed to support 128-bit integers.
- [x] ashlti3.c
- [x] ashrti3.c
- [x] divti3.c
- [x] fixdfti.c
- [x] fixsfti.c
- [x] fixunsdfti.c
- [x] fixunssfti.c
- [x] floattidf.c
- [x] floattisf.c
- [x] floatuntidf.c
- [x] floatuntisf.c
- [x] lshrti3.c
- [x] modti3.c
- [x] muloti4.c
- [x] multi3.c
- [x] udivmodti4.c
- [x] udivti3.c
- [x] umodti3.c
These builtins are needed to support `f16` and `f128`, which are in the process
of being added to Rust.
- [x] addtf3.c
- [x] comparetf2.c
- [x] divtf3.c
- [x] extenddftf2.c
- [x] extendhfsf2.c
- [x] extendhftf2.c
- [x] extendsftf2.c
- [x] fixtfdi.c
- [x] fixtfsi.c
- [x] fixtfti.c
- [x] fixunstfdi.c
- [x] fixunstfsi.c
- [x] fixunstfti.c
- [x] floatditf.c
- [x] floatsitf.c
- [x] floattitf.c
- [x] floatunditf.c
- [x] floatunsitf.c
- [x] floatuntitf.c
- [x] multf3.c
- [x] powitf2.c
- [x] subtf3.c
- [x] truncdfhf2.c
- [x] truncsfhf2.c
- [x] trunctfdf2.c
- [x] trunctfhf2.c
- [x] trunctfsf2.c
These builtins are used by the Hexagon DSP
- [ ] hexagon/common_entry_exit_abi1.S
- [ ] hexagon/common_entry_exit_abi2.S
- [ ] hexagon/common_entry_exit_legacy.S
- [x] hexagon/dfaddsub.S
- [x] hexagon/dfdiv.S
- [x] hexagon/dffma.S
- [x] hexagon/dfminmax.S
- [x] hexagon/dfmul.S
- [x] hexagon/dfsqrt.S
- [x] hexagon/divdi3.S
- [x] hexagon/divsi3.S
- [x] hexagon/fastmath2_dlib_asm.S
- [x] hexagon/fastmath2_ldlib_asm.S
- [x] hexagon/fastmath_dlib_asm.S
- [x] hexagon/memcpy_forward_vp4cp4n2.S
- [x] hexagon/memcpy_likely_aligned.S
- [x] hexagon/moddi3.S
- [x] hexagon/modsi3.S
- [x] hexagon/sfdiv_opt.S
- [x] hexagon/sfsqrt_opt.S
- [x] hexagon/udivdi3.S
- [x] hexagon/udivmoddi4.S
- [x] hexagon/udivmodsi4.S
- [x] hexagon/udivsi3.S
- [x] hexagon/umoddi3.S
- [x] hexagon/umodsi3.S
## Unimplemented functions
These builtins are for x87 `f80` floating-point numbers that are not supported
by Rust.
- ~~extendxftf2.c~~
- ~~fixunsxfdi.c~~
- ~~fixunsxfsi.c~~
- ~~fixunsxfti.c~~
- ~~fixxfdi.c~~
- ~~fixxfti.c~~
- ~~floatdixf.c~~
- ~~floattixf.c~~
- ~~floatundixf.c~~
- ~~floatuntixf.c~~
- ~~i386/floatdixf.S~~
- ~~i386/floatundixf.S~~
- ~~x86_64/floatdixf.c~~
- ~~x86_64/floatundixf.S~~
These builtins are for IBM "extended double" non-IEEE 128-bit floating-point
numbers.
- ~~ppc/divtc3.c~~
- ~~ppc/fixtfdi.c~~
- ~~ppc/fixtfti.c~~
- ~~ppc/fixunstfdi.c~~
- ~~ppc/fixunstfti.c~~
- ~~ppc/floatditf.c~~
- ~~ppc/floattitf.c~~
- ~~ppc/floatunditf.c~~
- ~~ppc/gcc_qadd.c~~
- ~~ppc/gcc_qdiv.c~~
- ~~ppc/gcc_qmul.c~~
- ~~ppc/gcc_qsub.c~~
- ~~ppc/multc3.c~~
These builtins are for 16-bit brain floating-point numbers that are not
supported by Rust.
- ~~truncdfbf2.c~~
- ~~truncsfbf2.c~~
- ~~trunctfxf2.c~~
These builtins involve complex floating-point types that are not supported by
Rust.
- ~~divdc3.c~~
- ~~divsc3.c~~
- ~~divtc3.c~~
- ~~divxc3.c~~
- ~~muldc3.c~~
- ~~mulsc3.c~~
- ~~multc3.c~~
- ~~mulxc3.c~~
- ~~powixf2.c~~
These builtins are never called by LLVM.
- ~~absvdi2.c~~
- ~~absvsi2.c~~
- ~~absvti2.c~~
- ~~addvdi3.c~~
- ~~addvsi3.c~~
- ~~addvti3.c~~
- ~~arm/aeabi_cdcmp.S~~
- ~~arm/aeabi_cdcmpeq_check_nan.c~~
- ~~arm/aeabi_cfcmp.S~~
- ~~arm/aeabi_cfcmpeq_check_nan.c~~
- ~~arm/aeabi_div0.c~~
- ~~arm/aeabi_drsub.c~~
- ~~arm/aeabi_frsub.c~~
- ~~arm/aeabi_memcmp.S~~
- ~~arm/bswapdi2.S~~
- ~~arm/bswapsi2.S~~
- ~~arm/clzdi2.S~~
- ~~arm/clzsi2.S~~
- ~~arm/comparesf2.S~~
- ~~arm/restore_vfp_d8_d15_regs.S~~
- ~~arm/save_vfp_d8_d15_regs.S~~
- ~~arm/switch16.S~~
- ~~arm/switch32.S~~
- ~~arm/switch8.S~~
- ~~arm/switchu8.S~~
- ~~cmpdi2.c~~
- ~~cmpti2.c~~
- ~~ffssi2.c~~
- ~~ffsdi2.c~~ - this is [called by gcc][jemalloc-fail] though!
- ~~ffsti2.c~~
- ~~mulvdi3.c~~
- ~~mulvsi3.c~~
- ~~mulvti3.c~~
- ~~negdf2.c~~
- ~~negdi2.c~~
- ~~negsf2.c~~
- ~~negti2.c~~
- ~~negvdi2.c~~
- ~~negvsi2.c~~
- ~~negvti2.c~~
- ~~paritydi2.c~~
- ~~paritysi2.c~~
- ~~parityti2.c~~
- ~~popcountdi2.c~~
- ~~popcountsi2.c~~
- ~~popcountti2.c~~
- ~~ppc/restFP.S~~
- ~~ppc/saveFP.S~~
- ~~subvdi3.c~~
- ~~subvsi3.c~~
- ~~subvti3.c~~
- ~~ucmpdi2.c~~
- ~~ucmpti2.c~~
- ~~udivmodti4.c~~
[jemalloc-fail]: https://travis-ci.org/rust-lang/rust/jobs/249772758
Rust only exposes atomic types on platforms that support them, and therefore does not need to fall back to software implementations.
- ~~arm/sync_fetch_and_add_4.S~~
- ~~arm/sync_fetch_and_add_8.S~~
- ~~arm/sync_fetch_and_and_4.S~~
- ~~arm/sync_fetch_and_and_8.S~~
- ~~arm/sync_fetch_and_max_4.S~~
- ~~arm/sync_fetch_and_max_8.S~~
- ~~arm/sync_fetch_and_min_4.S~~
- ~~arm/sync_fetch_and_min_8.S~~
- ~~arm/sync_fetch_and_nand_4.S~~
- ~~arm/sync_fetch_and_nand_8.S~~
- ~~arm/sync_fetch_and_or_4.S~~
- ~~arm/sync_fetch_and_or_8.S~~
- ~~arm/sync_fetch_and_sub_4.S~~
- ~~arm/sync_fetch_and_sub_8.S~~
- ~~arm/sync_fetch_and_umax_4.S~~
- ~~arm/sync_fetch_and_umax_8.S~~
- ~~arm/sync_fetch_and_umin_4.S~~
- ~~arm/sync_fetch_and_umin_8.S~~
- ~~arm/sync_fetch_and_xor_4.S~~
- ~~arm/sync_fetch_and_xor_8.S~~
- ~~arm/sync_synchronize.S~~
- ~~atomic.c~~
- ~~atomic_flag_clear.c~~
- ~~atomic_flag_clear_explicit.c~~
- ~~atomic_flag_test_and_set.c~~
- ~~atomic_flag_test_and_set_explicit.c~~
- ~~atomic_signal_fence.c~~
- ~~atomic_thread_fence.c~~
Miscellaneous functionality that is not used by Rust.
- ~~aarch64/fp_mode.c~~
- ~~aarch64/lse.S~~ (LSE atomics)
- ~~aarch64/sme-abi-init.c~~ (matrix extension)
- ~~aarch64/sme-abi.S~~ (matrix extension)
- ~~aarch64/sme-libc-routines.c~~ (matrix extension)
- ~~apple_versioning.c~~
- ~~arm/fp_mode.c~~
- ~~avr/exit.S~~
- ~~clear_cache.c~~
- ~~cpu_model/aarch64.c~~
- ~~cpu_model/x86.c~~
- ~~crtbegin.c~~
- ~~crtend.c~~
- ~~emutls.c~~
- ~~enable_execute_stack.c~~
- ~~eprintf.c~~
- ~~fp_mode.c~~ (float exception handling)
- ~~gcc_personality_v0.c~~
- ~~i386/fp_mode.c~~
- ~~int_util.c~~
- ~~loongarch/fp_mode.c~~
- ~~os_version_check.c~~
- ~~riscv/fp_mode.c~~
- ~~riscv/restore.S~~ (callee-saved registers)
- ~~riscv/save.S~~ (callee-saved registers)
- ~~trampoline_setup.c~~
- ~~ve/grow_stack.S~~
- ~~ve/grow_stack_align.S~~
Floating-point implementations of builtins that are only called from soft-float code. It would be better to simply use the generic soft-float versions in this case.
- ~~i386/floatdidf.S~~
- ~~i386/floatdisf.S~~
- ~~i386/floatundidf.S~~
- ~~i386/floatundisf.S~~
- ~~x86_64/floatundidf.S~~
- ~~x86_64/floatundisf.S~~
- ~~x86_64/floatdidf.c~~
- ~~x86_64/floatdisf.c~~
Unsupported in any current target: used on old versions of 32-bit iOS with ARMv5.
- ~~arm/adddf3vfp.S~~
- ~~arm/addsf3vfp.S~~
- ~~arm/divdf3vfp.S~~
- ~~arm/divsf3vfp.S~~
- ~~arm/eqdf2vfp.S~~
- ~~arm/eqsf2vfp.S~~
- ~~arm/extendsfdf2vfp.S~~
- ~~arm/fixdfsivfp.S~~
- ~~arm/fixsfsivfp.S~~
- ~~arm/fixunsdfsivfp.S~~
- ~~arm/fixunssfsivfp.S~~
- ~~arm/floatsidfvfp.S~~
- ~~arm/floatsisfvfp.S~~
- ~~arm/floatunssidfvfp.S~~
- ~~arm/floatunssisfvfp.S~~
- ~~arm/gedf2vfp.S~~
- ~~arm/gesf2vfp.S~~
- ~~arm/gtdf2vfp.S~~
- ~~arm/gtsf2vfp.S~~
- ~~arm/ledf2vfp.S~~
- ~~arm/lesf2vfp.S~~
- ~~arm/ltdf2vfp.S~~
- ~~arm/ltsf2vfp.S~~
- ~~arm/muldf3vfp.S~~
- ~~arm/mulsf3vfp.S~~
- ~~arm/nedf2vfp.S~~
- ~~arm/negdf2vfp.S~~
- ~~arm/negsf2vfp.S~~
- ~~arm/nesf2vfp.S~~
- ~~arm/subdf3vfp.S~~
- ~~arm/subsf3vfp.S~~
- ~~arm/truncdfsf2vfp.S~~
- ~~arm/unorddf2vfp.S~~
- ~~arm/unordsf2vfp.S~~
## License
Usage is allowed under the [MIT License] and the [Apache License, Version 2.0]
with the LLVM exception.
[MIT License]: https://opensource.org/license/mit
[Apache License, Version 2.0]: https://www.apache.org/licenses/LICENSE-2.0
### Contribution
Contributions are licensed under the MIT License, the Apache License,
Version 2.0, and the Apache-2.0 license with the LLVM exception.
See [LICENSE.txt](../LICENSE.txt) for full details.

View file

@ -0,0 +1,712 @@
mod configure;
use std::collections::BTreeMap;
use std::env;
use std::path::PathBuf;
use std::sync::atomic::Ordering;
use configure::{Target, configure_aliases, configure_f16_f128};
/// Build-script entry point: emits cargo configuration directives for the
/// current target and, when enabled, compiles the C fallback intrinsics.
fn main() {
    println!("cargo::rerun-if-changed=build.rs");
    println!("cargo::rerun-if-changed=configure.rs");

    let target = Target::from_env();
    let current_dir = env::current_dir().unwrap();

    configure_check_cfg();
    configure_f16_f128(&target);
    configure_aliases(&target);
    configure_libm(&target);

    println!(
        "cargo:compiler-rt={}",
        current_dir.join("compiler-rt").display()
    );

    // Emscripten's runtime includes all the builtins
    if target.os == "emscripten" {
        return;
    }

    // OpenBSD provides compiler_rt by default, use it instead of rebuilding it from source
    if target.os == "openbsd" {
        println!("cargo:rustc-link-search=native=/usr/lib");
        println!("cargo:rustc-link-lib=compiler_rt");
        return;
    }

    // Forcibly enable memory intrinsics on wasm & SGX as we don't have a libc to
    // provide them.
    let triple = &target.triple;
    let needs_mem = (triple.contains("wasm") && !triple.contains("wasi"))
        || (triple.contains("sgx") && triple.contains("fortanix"))
        || triple.contains("-none")
        || triple.contains("nvptx")
        || triple.contains("uefi")
        || triple.contains("xous");
    if needs_mem {
        println!("cargo:rustc-cfg=feature=\"mem\"");
    }

    // These targets have hardware unaligned access support.
    println!("cargo::rustc-check-cfg=cfg(feature, values(\"mem-unaligned\"))");
    if ["x86_64", "x86", "aarch64", "bpf"]
        .iter()
        .any(|arch| target.arch.contains(arch))
    {
        println!("cargo:rustc-cfg=feature=\"mem-unaligned\"");
    }

    // NOTE we are going to assume that llvm-target, what determines our codegen option, matches the
    // target triple. This is usually correct for our built-in targets but can break in presence of
    // custom targets, which can have arbitrary names.
    let llvm_target: Vec<_> = target.triple.split('-').collect();

    // Build missing intrinsics from compiler-rt C source code. If we're
    // mangling names though we assume that we're also in test mode so we don't
    // build anything and we rely on the upstream implementation of compiler-rt
    // functions
    if cfg!(feature = "c") && !cfg!(feature = "mangled-names") {
        // Don't use a C compiler for these targets:
        //
        // * nvptx - everything is bitcode, not compatible with mixed C/Rust
        if !target.arch.contains("nvptx") {
            #[cfg(feature = "c")]
            c::compile(&llvm_target, &target);
        }
    }

    // Only emit the ARM Linux atomic emulation on pre-ARMv6 architectures. This
    // includes the old androideabi. It is deprecated but it is available as a
    // rustc target (arm-linux-androideabi).
    println!("cargo::rustc-check-cfg=cfg(kernel_user_helpers)");
    if matches!(llvm_target[0], "armv4t" | "armv5te") || target.triple == "arm-linux-androideabi" {
        println!("cargo:rustc-cfg=kernel_user_helpers")
    }

    if llvm_target[0].starts_with("aarch64") {
        generate_aarch64_outlined_atomics();
    }
}
/// Run configuration for `libm` since it is included directly.
///
/// Much of this is copied from `libm/configure.rs`.
fn configure_libm(target: &Target) {
    // Declare all cfg names we may emit so `-Zcheck-cfg` doesn't warn.
    println!("cargo:rustc-check-cfg=cfg(intrinsics_enabled)");
    println!("cargo:rustc-check-cfg=cfg(arch_enabled)");
    println!("cargo:rustc-check-cfg=cfg(optimizations_enabled)");
    println!("cargo:rustc-check-cfg=cfg(feature, values(\"unstable-public-internals\"))");

    // Always use intrinsics
    println!("cargo:rustc-cfg=intrinsics_enabled");

    // The arch module may contain assembly.
    if !cfg!(feature = "no-asm") {
        println!("cargo:rustc-cfg=arch_enabled");
    }

    // Skip manual optimizations at low opt levels where they don't pay off.
    // (The `optimizations_enabled` check-cfg directive is emitted above.)
    if !matches!(target.opt_level.as_str(), "0" | "1") {
        println!("cargo:rustc-cfg=optimizations_enabled");
    }

    // Config shorthands
    println!("cargo:rustc-check-cfg=cfg(x86_no_sse)");
    if target.arch == "x86" && !target.features.iter().any(|f| f == "sse") {
        // Shorthand to detect i586 targets
        println!("cargo:rustc-cfg=x86_no_sse");
    }

    // Expose the build configuration to the crate for diagnostics.
    println!(
        "cargo:rustc-env=CFG_CARGO_FEATURES={:?}",
        target.cargo_features
    );
    println!("cargo:rustc-env=CFG_OPT_LEVEL={}", target.opt_level);
    println!("cargo:rustc-env=CFG_TARGET_FEATURES={:?}", target.features);

    // Activate libm's unstable features to make full use of Nightly.
    println!("cargo:rustc-cfg=feature=\"unstable-intrinsics\"");
}
/// Map an atomic `Ordering` to the suffix used in aarch64 outlined-atomic
/// symbol names (e.g. the `acq` in `__aarch64_cas4_acq`).
///
/// Panics for orderings that have no outlined-atomic variant (e.g. `SeqCst`).
fn aarch64_symbol(ordering: Ordering) -> &'static str {
    use Ordering::*;
    match ordering {
        Relaxed => "relax",
        Acquire => "acq",
        Release => "rel",
        AcqRel => "acq_rel",
        other => panic!("unknown symbol for {other:?}"),
    }
}
/// The `concat_idents` macro is extremely annoying and doesn't allow us to define new items.
/// Define them from the build script instead.
/// Note that the majority of the code is still defined in `aarch64.rs` through inline macros.
fn generate_aarch64_outlined_atomics() {
    use std::fmt::Write;

    // Each generated macro takes another macro's path and invokes it once per
    // outlined-atomic symbol. `#[macro_export]` so that we can use this in tests.
    let macro_header =
        |name| format!("#[macro_export] macro_rules! foreach_{name} {{ ($macro:path) => {{\n");

    // Generate different macros for add/clr/eor/set so that we can test them separately.
    let mut macros = BTreeMap::new();
    for sym in ["cas", "ldadd", "ldclr", "ldeor", "ldset", "swp"] {
        macros.insert(sym, macro_header(sym));
    }

    // Only CAS supports 16 bytes, and it has a different implementation that uses a different macro.
    let mut cas16 = macro_header("cas16");

    let orderings = [
        Ordering::Relaxed,
        Ordering::Acquire,
        Ordering::Release,
        Ordering::AcqRel,
    ];
    for ordering in orderings {
        let sym_ordering = aarch64_symbol(ordering);
        for size in [1, 2, 4, 8] {
            for (sym, body) in &mut macros {
                let name = format!("__aarch64_{sym}{size}_{sym_ordering}");
                writeln!(body, "$macro!( {ordering:?}, {size}, {name} );").unwrap();
            }
        }
        let name = format!("__aarch64_cas16_{sym_ordering}");
        writeln!(cas16, "$macro!( {ordering:?}, {name} );").unwrap();
    }

    // Close every macro definition and emit them into one include file.
    let mut contents = String::new();
    for definition in macros.values().chain(std::iter::once(&cas16)) {
        contents += definition;
        contents += "}; }\n";
    }
    let out_dir = PathBuf::from(std::env::var("OUT_DIR").unwrap());
    std::fs::write(out_dir.join("outlined_atomics.rs"), contents).unwrap();
}
/// Emit directives for features we expect to support that aren't in `Cargo.toml`.
///
/// These are mostly cfg elements emitted by this `build.rs`.
fn configure_check_cfg() {
    // Functions where we can set the "optimized-c" flag
    const HAS_OPTIMIZED_C: &[&str] = &[
        "__ashldi3",
        "__ashlsi3",
        "__ashrdi3",
        "__ashrsi3",
        "__bswapsi2",
        "__bswapdi2",
        "__bswapti2",
        "__divdi3",
        "__divsi3",
        "__divmoddi4",
        "__divmodsi4",
        "__divmodti4",
        "__lshrdi3",
        "__lshrsi3",
        "__moddi3",
        "__modsi3",
        "__muldi3",
        "__udivdi3",
        "__udivmoddi4",
        "__udivmodsi4",
        "__udivsi3",
        "__umoddi3",
        "__umodsi3",
    ];

    // Build a list of all aarch64 atomic operation functions
    let mut aarch_atomic = Vec::new();
    for aarch_op in ["cas", "ldadd", "ldclr", "ldeor", "ldset", "swp"] {
        // Only compare-and-swap has a 16-byte variant.
        let op_sizes = if aarch_op == "cas" {
            [1, 2, 4, 8, 16].as_slice()
        } else {
            [1, 2, 4, 8].as_slice()
        };

        for op_size in op_sizes {
            for ordering in ["relax", "acq", "rel", "acq_rel"] {
                aarch_atomic.push(format!("__aarch64_{aarch_op}{op_size}_{ordering}"));
            }
        }
    }

    for fn_name in HAS_OPTIMIZED_C
        .iter()
        .copied()
        .chain(aarch_atomic.iter().map(|s| s.as_str()))
    {
        println!("cargo::rustc-check-cfg=cfg({fn_name}, values(\"optimized-c\"))",);
    }

    // Rustc is unaware of sparc target features, but this does show up from
    // `rustc --print target-features --target sparc64-unknown-linux-gnu`.
    println!("cargo::rustc-check-cfg=cfg(target_feature, values(\"vis3\"))");

    // FIXME: these come from libm and should be changed there
    println!("cargo::rustc-check-cfg=cfg(feature, values(\"checked\"))");
    println!("cargo::rustc-check-cfg=cfg(assert_no_panic)");
}
#[cfg(feature = "c")]
mod c {
use std::collections::{BTreeMap, HashSet};
use std::env;
use std::fs::{self, File};
use std::io::Write;
use std::path::{Path, PathBuf};
use super::Target;
/// Registry mapping each intrinsic symbol to the single C/assembly source
/// file that should provide it.
struct Sources {
    // SYMBOL -> PATH TO SOURCE
    map: BTreeMap<&'static str, &'static str>,
}

impl Sources {
    fn new() -> Sources {
        Sources {
            map: BTreeMap::new(),
        }
    }

    /// Register sources, preferring arch-optimized implementations.
    ///
    /// NOTE Some intrinsics have both a generic implementation (e.g.
    /// `floatdidf.c`) and an arch optimized implementation
    /// (`x86_64/floatdidf.c`). In those cases, we keep the arch optimized
    /// implementation and discard the generic implementation. If we don't
    /// and keep both implementations, the linker will yell at us about
    /// duplicate symbols!
    fn extend(&mut self, sources: &[(&'static str, &'static str)]) {
        for &(symbol, path) in sources {
            // A path with a directory separator is an arch-optimized
            // implementation and always wins; a generic implementation is
            // only recorded when nothing has claimed the symbol yet.
            let arch_specific = path.contains("/");
            if arch_specific || !self.map.contains_key(symbol) {
                self.map.insert(symbol, path);
            }
        }
    }

    /// Forget symbols entirely; panics if a symbol was never registered.
    fn remove(&mut self, symbols: &[&str]) {
        for symbol in symbols {
            self.map.remove(*symbol).unwrap();
        }
    }
}
/// Compile intrinsics from the compiler-rt C source code.
///
/// `llvm_target` is the target triple split on `-`; `target` is the parsed
/// cargo target description. Emits `cargo:rustc-cfg=<sym>="optimized-c"` for
/// every symbol whose C implementation is built.
pub fn compile(llvm_target: &[&str], target: &Target) {
    let mut consider_float_intrinsics = true;
    let cfg = &mut cc::Build::new();

    // AArch64 GCCs exit with an error condition when they encounter any kind of floating point
    // code if the `nofp` and/or `nosimd` compiler flags have been set.
    //
    // Therefore, evaluate if those flags are present and set a boolean that causes any
    // compiler-rt intrinsics that contain floating point source to be excluded for this target.
    if target.arch == "aarch64" {
        let cflags_key = String::from("CFLAGS_") + &(target.triple.replace("-", "_"));
        if let Ok(cflags_value) = env::var(cflags_key) {
            if cflags_value.contains("+nofp") || cflags_value.contains("+nosimd") {
                consider_float_intrinsics = false;
            }
        }
    }

    // `compiler-rt` requires `COMPILER_RT_HAS_FLOAT16` to be defined to make it use the
    // `_Float16` type for `f16` intrinsics. This shouldn't matter as all existing `f16`
    // intrinsics have been ported to Rust in `compiler-builtins` as C compilers don't
    // support `_Float16` on all targets (whereas Rust does). However, define the macro
    // anyway to prevent issues like rust#118813 and rust#123885 silently recurring if more
    // `f16` intrinsics get accidentally added here in the future.
    cfg.define("COMPILER_RT_HAS_FLOAT16", None);

    cfg.warnings(false);

    if target.env == "msvc" {
        // Don't pull in extra libraries on MSVC
        cfg.flag("/Zl");

        // Emulate C99 and C++11's __func__ for MSVC prior to 2013 CTP
        cfg.define("__func__", Some("__FUNCTION__"));
    } else {
        // Turn off various features of gcc and such, mostly copying
        // compiler-rt's build system already
        cfg.flag("-fno-builtin");
        cfg.flag("-fvisibility=hidden");
        cfg.flag("-ffreestanding");
        // Avoid the following warning appearing once **per file**:
        // clang: warning: optimization flag '-fomit-frame-pointer' is not supported for target 'armv7' [-Wignored-optimization-argument]
        //
        // Note that compiler-rt's build system also checks
        //
        // `check_cxx_compiler_flag(-fomit-frame-pointer COMPILER_RT_HAS_FOMIT_FRAME_POINTER_FLAG)`
        //
        // in https://github.com/rust-lang/compiler-rt/blob/c8fbcb3/cmake/config-ix.cmake#L19.
        cfg.flag_if_supported("-fomit-frame-pointer");
        cfg.define("VISIBILITY_HIDDEN", None);

        if let "aarch64" | "arm64ec" = target.arch.as_str() {
            // FIXME(llvm20): Older GCCs on A64 fail to build with
            // -Werror=implicit-function-declaration due to a compiler-rt bug.
            // With a newer LLVM we should be able to enable the flag everywhere.
            // https://github.com/llvm/llvm-project/commit/8aa9d6206ce55bdaaf422839c351fbd63f033b89
        } else {
            // Avoid implicitly creating references to undefined functions
            cfg.flag("-Werror=implicit-function-declaration");
        }
    }

    // int_util.c tries to include stdlib.h if `_WIN32` is defined,
    // which it is when compiling UEFI targets with clang. This is
    // at odds with compiling with `-ffreestanding`, as the header
    // may be incompatible or not present. Create a minimal stub
    // header to use instead.
    if target.os == "uefi" {
        let out_dir = PathBuf::from(env::var("OUT_DIR").unwrap());
        let include_dir = out_dir.join("include");
        if !include_dir.exists() {
            fs::create_dir(&include_dir).unwrap();
        }
        fs::write(include_dir.join("stdlib.h"), "#include <stddef.h>").unwrap();
        cfg.flag(&format!("-I{}", include_dir.to_str().unwrap()));
    }

    let mut sources = Sources::new();
    sources.extend(&[
        ("__absvdi2", "absvdi2.c"),
        ("__absvsi2", "absvsi2.c"),
        ("__addvdi3", "addvdi3.c"),
        ("__addvsi3", "addvsi3.c"),
        ("__cmpdi2", "cmpdi2.c"),
        ("__int_util", "int_util.c"),
        ("__mulvdi3", "mulvdi3.c"),
        ("__mulvsi3", "mulvsi3.c"),
        ("__negdi2", "negdi2.c"),
        ("__negvdi2", "negvdi2.c"),
        ("__negvsi2", "negvsi2.c"),
        ("__paritydi2", "paritydi2.c"),
        ("__paritysi2", "paritysi2.c"),
        ("__popcountdi2", "popcountdi2.c"),
        ("__popcountsi2", "popcountsi2.c"),
        ("__subvdi3", "subvdi3.c"),
        ("__subvsi3", "subvsi3.c"),
        ("__ucmpdi2", "ucmpdi2.c"),
    ]);

    if consider_float_intrinsics {
        sources.extend(&[
            ("__divdc3", "divdc3.c"),
            ("__divsc3", "divsc3.c"),
            ("__muldc3", "muldc3.c"),
            ("__mulsc3", "mulsc3.c"),
            ("__negdf2", "negdf2.c"),
            ("__negsf2", "negsf2.c"),
        ]);
    }

    // On iOS and 32-bit OSX these are all just empty intrinsics, no need to
    // include them.
    if target.vendor != "apple" || target.arch != "x86" {
        sources.extend(&[
            ("__absvti2", "absvti2.c"),
            ("__addvti3", "addvti3.c"),
            ("__cmpti2", "cmpti2.c"),
            ("__ffsti2", "ffsti2.c"),
            ("__mulvti3", "mulvti3.c"),
            ("__negti2", "negti2.c"),
            ("__parityti2", "parityti2.c"),
            ("__popcountti2", "popcountti2.c"),
            ("__subvti3", "subvti3.c"),
            ("__ucmpti2", "ucmpti2.c"),
        ]);

        if consider_float_intrinsics {
            sources.extend(&[("__negvti2", "negvti2.c")]);
        }
    }

    if target.vendor == "apple" {
        sources.extend(&[
            ("atomic_flag_clear", "atomic_flag_clear.c"),
            ("atomic_flag_clear_explicit", "atomic_flag_clear_explicit.c"),
            ("atomic_flag_test_and_set", "atomic_flag_test_and_set.c"),
            (
                "atomic_flag_test_and_set_explicit",
                "atomic_flag_test_and_set_explicit.c",
            ),
            ("atomic_signal_fence", "atomic_signal_fence.c"),
            ("atomic_thread_fence", "atomic_thread_fence.c"),
        ]);
    }

    if target.env != "msvc" {
        if target.arch == "x86" {
            sources.extend(&[
                ("__ashldi3", "i386/ashldi3.S"),
                ("__ashrdi3", "i386/ashrdi3.S"),
                ("__divdi3", "i386/divdi3.S"),
                ("__lshrdi3", "i386/lshrdi3.S"),
                ("__moddi3", "i386/moddi3.S"),
                ("__muldi3", "i386/muldi3.S"),
                ("__udivdi3", "i386/udivdi3.S"),
                ("__umoddi3", "i386/umoddi3.S"),
            ]);
        }
    }

    if target.arch == "arm" && target.vendor != "apple" && target.env != "msvc" {
        sources.extend(&[
            ("__aeabi_div0", "arm/aeabi_div0.c"),
            ("__aeabi_drsub", "arm/aeabi_drsub.c"),
            ("__aeabi_frsub", "arm/aeabi_frsub.c"),
            ("__bswapdi2", "arm/bswapdi2.S"),
            ("__bswapsi2", "arm/bswapsi2.S"),
            ("__divmodsi4", "arm/divmodsi4.S"),
            ("__divsi3", "arm/divsi3.S"),
            ("__modsi3", "arm/modsi3.S"),
            ("__switch16", "arm/switch16.S"),
            ("__switch32", "arm/switch32.S"),
            ("__switch8", "arm/switch8.S"),
            ("__switchu8", "arm/switchu8.S"),
            ("__sync_synchronize", "arm/sync_synchronize.S"),
            ("__udivmodsi4", "arm/udivmodsi4.S"),
            ("__udivsi3", "arm/udivsi3.S"),
            ("__umodsi3", "arm/umodsi3.S"),
        ]);

        if target.os == "freebsd" {
            sources.extend(&[("__clear_cache", "clear_cache.c")]);
        }

        // First of all aeabi_cdcmp and aeabi_cfcmp are never called by LLVM.
        // Second, they are little-endian only, so builds fail on big-endian targets.
        // Temporary workaround: exclude these files for big-endian targets.
        if !llvm_target[0].starts_with("thumbeb") && !llvm_target[0].starts_with("armeb") {
            sources.extend(&[
                ("__aeabi_cdcmp", "arm/aeabi_cdcmp.S"),
                ("__aeabi_cdcmpeq_check_nan", "arm/aeabi_cdcmpeq_check_nan.c"),
                ("__aeabi_cfcmp", "arm/aeabi_cfcmp.S"),
                ("__aeabi_cfcmpeq_check_nan", "arm/aeabi_cfcmpeq_check_nan.c"),
            ]);
        }
    }

    if llvm_target[0] == "armv7" {
        sources.extend(&[
            ("__sync_fetch_and_add_4", "arm/sync_fetch_and_add_4.S"),
            ("__sync_fetch_and_add_8", "arm/sync_fetch_and_add_8.S"),
            ("__sync_fetch_and_and_4", "arm/sync_fetch_and_and_4.S"),
            ("__sync_fetch_and_and_8", "arm/sync_fetch_and_and_8.S"),
            ("__sync_fetch_and_max_4", "arm/sync_fetch_and_max_4.S"),
            ("__sync_fetch_and_max_8", "arm/sync_fetch_and_max_8.S"),
            ("__sync_fetch_and_min_4", "arm/sync_fetch_and_min_4.S"),
            ("__sync_fetch_and_min_8", "arm/sync_fetch_and_min_8.S"),
            ("__sync_fetch_and_nand_4", "arm/sync_fetch_and_nand_4.S"),
            ("__sync_fetch_and_nand_8", "arm/sync_fetch_and_nand_8.S"),
            ("__sync_fetch_and_or_4", "arm/sync_fetch_and_or_4.S"),
            ("__sync_fetch_and_or_8", "arm/sync_fetch_and_or_8.S"),
            ("__sync_fetch_and_sub_4", "arm/sync_fetch_and_sub_4.S"),
            ("__sync_fetch_and_sub_8", "arm/sync_fetch_and_sub_8.S"),
            ("__sync_fetch_and_umax_4", "arm/sync_fetch_and_umax_4.S"),
            ("__sync_fetch_and_umax_8", "arm/sync_fetch_and_umax_8.S"),
            ("__sync_fetch_and_umin_4", "arm/sync_fetch_and_umin_4.S"),
            ("__sync_fetch_and_umin_8", "arm/sync_fetch_and_umin_8.S"),
            ("__sync_fetch_and_xor_4", "arm/sync_fetch_and_xor_4.S"),
            ("__sync_fetch_and_xor_8", "arm/sync_fetch_and_xor_8.S"),
        ]);
    }

    if llvm_target.last().unwrap().ends_with("eabihf") {
        if !llvm_target[0].starts_with("thumbv7em")
            && !llvm_target[0].starts_with("thumbv8m.main")
        {
            // The FPU option chosen for these architectures in cc-rs, ie:
            // -mfpu=fpv4-sp-d16 for thumbv7em
            // -mfpu=fpv5-sp-d16 for thumbv8m.main
            // do not support double precision floating points conversions so the files
            // that include such instructions are not included for these targets.
            sources.extend(&[
                ("__fixdfsivfp", "arm/fixdfsivfp.S"),
                ("__fixunsdfsivfp", "arm/fixunsdfsivfp.S"),
                ("__floatsidfvfp", "arm/floatsidfvfp.S"),
                ("__floatunssidfvfp", "arm/floatunssidfvfp.S"),
            ]);
        }

        sources.extend(&[
            ("__fixsfsivfp", "arm/fixsfsivfp.S"),
            ("__fixunssfsivfp", "arm/fixunssfsivfp.S"),
            ("__floatsisfvfp", "arm/floatsisfvfp.S"),
            ("__floatunssisfvfp", "arm/floatunssisfvfp.S"),
            ("__restore_vfp_d8_d15_regs", "arm/restore_vfp_d8_d15_regs.S"),
            ("__save_vfp_d8_d15_regs", "arm/save_vfp_d8_d15_regs.S"),
            ("__negdf2vfp", "arm/negdf2vfp.S"),
            ("__negsf2vfp", "arm/negsf2vfp.S"),
        ]);
    }

    if (target.arch == "aarch64" || target.arch == "arm64ec") && consider_float_intrinsics {
        sources.extend(&[
            ("__comparetf2", "comparetf2.c"),
            ("__fe_getround", "fp_mode.c"),
            ("__fe_raise_inexact", "fp_mode.c"),
        ]);

        if target.os != "windows" && target.os != "cygwin" {
            sources.extend(&[("__multc3", "multc3.c")]);
        }
    }

    if target.arch == "mips" || target.arch == "riscv32" || target.arch == "riscv64" {
        sources.extend(&[("__bswapsi2", "bswapsi2.c")]);
    }

    if target.arch == "mips64" {
        sources.extend(&[("__netf2", "comparetf2.c"), ("__fe_getround", "fp_mode.c")]);
    }

    if target.arch == "loongarch64" {
        sources.extend(&[("__netf2", "comparetf2.c"), ("__fe_getround", "fp_mode.c")]);
    }

    // Remove the assembly implementations that won't compile for the target
    if llvm_target[0] == "thumbv6m" || llvm_target[0] == "thumbv8m.base" || target.os == "uefi"
    {
        let mut to_remove = Vec::new();
        for (k, v) in sources.map.iter() {
            if v.ends_with(".S") {
                to_remove.push(*k);
            }
        }
        sources.remove(&to_remove);
    }

    if llvm_target[0] == "thumbv7m" || llvm_target[0] == "thumbv7em" {
        sources.remove(&["__aeabi_cdcmp", "__aeabi_cfcmp"]);
    }

    // Android and Cygwin use emulated TLS so we need a runtime support function.
    if target.os == "android" || target.os == "cygwin" {
        sources.extend(&[("__emutls_get_address", "emutls.c")]);
    }

    // Work around a bug in the NDK headers (fixed in
    // https://r.android.com/2038949 which will be released in a future
    // NDK version) by providing a definition of LONG_BIT.
    if target.os == "android" {
        cfg.define("LONG_BIT", "(8 * sizeof(long))");
    }

    // OpenHarmony also uses emulated TLS.
    if target.env == "ohos" {
        sources.extend(&[("__emutls_get_address", "emutls.c")]);
    }

    // When compiling the C code we require the user to tell us where the
    // source code is, and this is largely done so when we're compiling as
    // part of rust-lang/rust we can use the same llvm-project repository as
    // rust-lang/rust.
    let root = match env::var_os("RUST_COMPILER_RT_ROOT") {
        Some(s) => PathBuf::from(s),
        None => {
            panic!(
                "RUST_COMPILER_RT_ROOT is not set. You may need to run \
                `ci/download-compiler-rt.sh`."
            );
        }
    };
    if !root.exists() {
        panic!("RUST_COMPILER_RT_ROOT={} does not exist", root.display());
    }

    // Support deterministic builds by remapping the __FILE__ prefix if the
    // compiler supports it. This fixes the nondeterminism caused by the
    // use of that macro in lib/builtins/int_util.h in compiler-rt.
    cfg.flag_if_supported(&format!("-ffile-prefix-map={}=.", root.display()));

    // Include out-of-line atomics for aarch64, which are all generated by supplying different
    // sets of flags to the same source file.
    // Note: Out-of-line aarch64 atomics are not supported by the msvc toolchain (#430) and
    // on uefi.
    let src_dir = root.join("lib/builtins");
    if target.arch == "aarch64" && target.env != "msvc" && target.os != "uefi" {
        // See below for why we're building these as separate libraries.
        build_aarch64_out_of_line_atomics_libraries(&src_dir, cfg);

        // Some run-time CPU feature detection is necessary, as well.
        let cpu_model_src = if src_dir.join("cpu_model.c").exists() {
            "cpu_model.c"
        } else {
            "cpu_model/aarch64.c"
        };
        sources.extend(&[("__aarch64_have_lse_atomics", cpu_model_src)]);
    }

    // Two symbols may map to the same source file (e.g. `fp_mode.c`); only
    // compile each file once, but emit an "optimized-c" cfg for every symbol.
    let mut added_sources = HashSet::new();
    for (sym, src) in sources.map.iter() {
        let src = src_dir.join(src);
        if added_sources.insert(src.clone()) {
            cfg.file(&src);
            println!("cargo:rerun-if-changed={}", src.display());
        }
        println!("cargo:rustc-cfg={}=\"optimized-c\"", sym);
    }

    cfg.compile("libcompiler-rt.a");
}
/// Generate one tiny `.S` wrapper per (operation, size, memory-model)
/// combination and register each with `cfg`.
///
/// The original compiler-rt build system compiles the same source file
/// multiple times with different compiler options. Here we do something
/// slightly different: we create multiple .S files with the proper #defines
/// and then include the original file.
///
/// This is needed because the cc crate doesn't allow us to override the name
/// of object files and libtool requires all objects in an archive to have
/// unique names.
fn build_aarch64_out_of_line_atomics_libraries(builtins_dir: &Path, cfg: &mut cc::Build) {
    let out_dir = PathBuf::from(env::var("OUT_DIR").unwrap());
    let outlined_atomics_file = builtins_dir.join("aarch64").join("lse.S");
    println!("cargo:rerun-if-changed={}", outlined_atomics_file.display());
    cfg.include(&builtins_dir);

    for op in &["cas", "swp", "ldadd", "ldclr", "ldeor", "ldset"] {
        for width in &[1, 2, 4, 8, 16] {
            // Only compare-and-swap has a 16-byte variant.
            if *width == 16 && *op != "cas" {
                continue;
            }

            for (model_number, model_name) in
                &[(1, "relax"), (2, "acq"), (3, "rel"), (4, "acq_rel")]
            {
                let stub_path = out_dir.join(format!("lse_{op}{width}_{model_name}.S"));
                let mut stub = File::create(&stub_path).unwrap();
                write!(
                    stub,
                    "#define L_{op}\n#define SIZE {width}\n#define MODEL {model_number}\n#include \"{}\"\n",
                    outlined_atomics_file.canonicalize().unwrap().display()
                )
                .unwrap();
                drop(stub);
                cfg.file(stub_path);

                let sym = format!("__aarch64_{op}{width}_{model_name}");
                println!("cargo:rustc-cfg={}=\"optimized-c\"", sym);
            }
        }
    }
}
}

View file

@ -0,0 +1,136 @@
// Configuration that is shared between `compiler_builtins` and `builtins_test`.
use std::env;
#[derive(Debug)]
#[allow(dead_code)]
pub struct Target {
    /// Full target triple, from the `TARGET` env var (e.g. `x86_64-unknown-linux-gnu`).
    pub triple: String,
    /// `triple` split on `-` into its components.
    pub triple_split: Vec<String>,
    /// Optimization level, from the `OPT_LEVEL` env var (e.g. "0", "3").
    pub opt_level: String,
    /// Enabled cargo features, derived from `CARGO_FEATURE_*` env vars,
    /// lowercased with `_` replaced by `-`.
    pub cargo_features: Vec<String>,
    /// From `CARGO_CFG_TARGET_OS`.
    pub os: String,
    /// From `CARGO_CFG_TARGET_ARCH`.
    pub arch: String,
    /// From `CARGO_CFG_TARGET_VENDOR`.
    pub vendor: String,
    /// From `CARGO_CFG_TARGET_ENV` (e.g. "gnu", "msvc", "ohos").
    pub env: String,
    /// Pointer width in bits, from `CARGO_CFG_TARGET_POINTER_WIDTH`.
    pub pointer_width: u8,
    /// True for little-endian targets, from `CARGO_CFG_TARGET_ENDIAN`.
    pub little_endian: bool,
    /// Enabled target features, from `CARGO_CFG_TARGET_FEATURE` (comma-separated).
    pub features: Vec<String>,
}
impl Target {
    /// Build a `Target` from the environment variables cargo passes to build
    /// scripts. Panics if a required variable is missing or malformed.
    pub fn from_env() -> Self {
        let triple = env::var("TARGET").unwrap();
        let triple_split = triple.split('-').map(ToOwned::to_owned).collect();

        let little_endian = match env::var("CARGO_CFG_TARGET_ENDIAN").unwrap().as_str() {
            "little" => true,
            "big" => false,
            x => panic!("unknown endian {x}"),
        };

        // Cargo exposes enabled features as `CARGO_FEATURE_<NAME>`; convert
        // back to the `kebab-case` feature names.
        let cargo_features = env::vars()
            .filter_map(|(name, _value)| name.strip_prefix("CARGO_FEATURE_").map(ToOwned::to_owned))
            .map(|name| name.to_lowercase().replace("_", "-"))
            .collect();

        let features = env::var("CARGO_CFG_TARGET_FEATURE")
            .unwrap_or_default()
            .split(",")
            .map(ToOwned::to_owned)
            .collect();

        let pointer_width = env::var("CARGO_CFG_TARGET_POINTER_WIDTH")
            .unwrap()
            .parse()
            .unwrap();

        Self {
            triple,
            triple_split,
            os: env::var("CARGO_CFG_TARGET_OS").unwrap(),
            opt_level: env::var("OPT_LEVEL").unwrap(),
            cargo_features,
            arch: env::var("CARGO_CFG_TARGET_ARCH").unwrap(),
            vendor: env::var("CARGO_CFG_TARGET_VENDOR").unwrap(),
            env: env::var("CARGO_CFG_TARGET_ENV").unwrap(),
            pointer_width,
            little_endian,
            features,
        }
    }

    /// Whether `feature` appears in the target's enabled feature list.
    #[allow(dead_code)]
    pub fn has_feature(&self, feature: &str) -> bool {
        self.features.iter().any(|f| f == feature)
    }
}
/// Emit shorthand cfgs derived from the target architecture component.
pub fn configure_aliases(target: &Target) {
    let arch = target.triple_split[0].as_str();

    // To compile builtins-test-intrinsics for thumb targets, where there is no libc
    println!("cargo::rustc-check-cfg=cfg(thumb)");
    if arch.starts_with("thumb") {
        println!("cargo:rustc-cfg=thumb")
    }

    // compiler-rt `cfg`s away some intrinsics for thumbv6m and thumbv8m.base because
    // these targets do not have full Thumb-2 support but only original Thumb-1.
    // We have to cfg our code accordingly.
    println!("cargo::rustc-check-cfg=cfg(thumb_1)");
    if matches!(arch, "thumbv6m" | "thumbv8m.base") {
        println!("cargo:rustc-cfg=thumb_1")
    }
}
/// Configure whether or not `f16` and `f128` support should be enabled.
pub fn configure_f16_f128(target: &Target) {
    // Set whether or not `f16` and `f128` are supported at a basic level by LLVM. This only means
    // that the backend will not crash when using these types and generates code that can be called
    // without crashing (no infinite recursion). This does not mean that the platform doesn't have
    // ABI or other bugs.
    //
    // We do this here rather than in `rust-lang/rust` because configuring via cargo features is
    // not straightforward.
    //
    // Original source of this list:
    // <https://github.com/rust-lang/compiler-builtins/pull/652#issuecomment-2266151350>
    let arch = target.arch.as_str();

    let f16_ok = match arch {
        // Unsupported <https://github.com/llvm/llvm-project/issues/94434>
        "arm64ec" => false,
        // Selection failure <https://github.com/llvm/llvm-project/issues/50374>
        "s390x" => false,
        // Infinite recursion <https://github.com/llvm/llvm-project/issues/97981>
        "csky" | "hexagon" | "powerpc" | "powerpc64" | "sparc" | "sparc64" | "wasm32"
        | "wasm64" => false,
        // Most everything else works as of LLVM 19
        _ => true,
    };

    let f128_ok = match arch {
        // Unsupported (libcall is not supported) <https://github.com/llvm/llvm-project/issues/121122>
        "amdgpu" => false,
        // Unsupported <https://github.com/llvm/llvm-project/issues/94434>
        "arm64ec" => false,
        // FIXME(llvm20): fixed by <https://github.com/llvm/llvm-project/pull/117525>
        "mips64" | "mips64r6" => false,
        // Selection failure <https://github.com/llvm/llvm-project/issues/95471>
        "nvptx64" => false,
        // Selection failure <https://github.com/llvm/llvm-project/issues/101545>
        "powerpc64" if target.os == "aix" => false,
        // Selection failure <https://github.com/llvm/llvm-project/issues/41838>
        "sparc" => false,
        // Most everything else works as of LLVM 19
        _ => true,
    };

    // If the feature is set, disable these types.
    let force_disable = env::var_os("CARGO_FEATURE_NO_F16_F128").is_some();

    println!("cargo::rustc-check-cfg=cfg(f16_enabled)");
    println!("cargo::rustc-check-cfg=cfg(f128_enabled)");

    if !force_disable {
        if f16_ok {
            println!("cargo::rustc-cfg=f16_enabled");
        }
        if f128_ok {
            println!("cargo::rustc-cfg=f128_enabled");
        }
    }
}

View file

@ -0,0 +1,21 @@
#![allow(unused_imports)]
use core::intrinsics;
intrinsics! {
    // Stack-probe routine (`__chkstk`) for aarch64 UEFI targets.
    //
    // NOTE(review): per the Windows AArch64 convention, x15 appears to carry
    // the requested allocation size in units of 16 bytes — the `lsl #4` below
    // converts it to a byte count. Confirm against upstream `chkstk.S`.
    #[unsafe(naked)]
    #[cfg(all(target_os = "uefi", not(feature = "no-asm")))]
    pub unsafe extern "C" fn __chkstk() {
        core::arch::naked_asm!(
            ".p2align 2",
            // x16 = requested bytes (x15 * 16)
            "lsl x16, x15, #4",
            // x17 walks down from the current stack pointer
            "mov x17, sp",
            "1:",
            "sub x17, x17, 4096",
            "subs x16, x16, 4096",
            // Touch one page (4096 bytes) per iteration so each page of the
            // new stack region is probed before it is used.
            "ldr xzr, [x17]",
            // Loop while more than a page of the request remains
            "b.gt 1b",
            "ret",
        );
    }
}

View file

@ -0,0 +1,273 @@
//! Aarch64 targets have two possible implementations for atomics:
//! 1. Load-Locked, Store-Conditional (LL/SC), older and slower.
//! 2. Large System Extensions (LSE), newer and faster.
//! To avoid breaking backwards compat, C toolchains introduced a concept of "outlined atomics",
//! where atomic operations call into the compiler runtime to dispatch between two depending on
//! which is supported on the current CPU.
//! See https://community.arm.com/arm-community-blogs/b/tools-software-ides-blog/posts/making-the-most-of-the-arm-architecture-in-gcc-10#:~:text=out%20of%20line%20atomics for more discussion.
//!
//! Currently we only support LL/SC, because LSE requires `getauxval` from libc in order to do runtime detection.
//! Use the `compiler-rt` intrinsics if you want LSE support.
//!
//! Ported from `aarch64/lse.S` in LLVM's compiler-rt.
//!
//! Generate functions for each of the following symbols:
//! __aarch64_casM_ORDER
//! __aarch64_swpN_ORDER
//! __aarch64_ldaddN_ORDER
//! __aarch64_ldclrN_ORDER
//! __aarch64_ldeorN_ORDER
//! __aarch64_ldsetN_ORDER
//! for N = {1, 2, 4, 8}, M = {1, 2, 4, 8, 16}, ORDER = { relax, acq, rel, acq_rel }
//!
//! The original `lse.S` has some truly horrifying code that expects to be compiled multiple times with different constants.
//! We do something similar, but with macro arguments.
#![cfg_attr(feature = "c", allow(unused_macros))] // avoid putting the macros into a submodule
// We don't do runtime dispatch so we don't have to worry about the `__aarch64_have_lse_atomics` global ctor.
/// Translate a byte size to a Rust type.
#[rustfmt::skip]
macro_rules! int_ty {
    (1) => { i8 };
    (2) => { i16 };
    (4) => { i32 };
    (8) => { i64 };
    (16) => { i128 };
}

/// Given a byte size and a register number, return a register of the appropriate size.
///
/// See <https://developer.arm.com/documentation/102374/0101/Registers-in-AArch64---general-purpose-registers>.
#[rustfmt::skip]
macro_rules! reg {
    (1, $num:literal) => { concat!("w", $num) };
    (2, $num:literal) => { concat!("w", $num) };
    (4, $num:literal) => { concat!("w", $num) };
    (8, $num:literal) => { concat!("x", $num) };
}

/// Given an atomic ordering, translate it to the acquire suffix for the ldxr aarch64 ASM instruction.
#[rustfmt::skip]
macro_rules! acquire {
    (Relaxed) => { "" };
    (Acquire) => { "a" };
    (Release) => { "" };
    (AcqRel) => { "a" };
}

/// Given an atomic ordering, translate it to the release suffix for the stxr aarch64 ASM instruction.
#[rustfmt::skip]
macro_rules! release {
    (Relaxed) => { "" };
    (Acquire) => { "" };
    (Release) => { "l" };
    (AcqRel) => { "l" };
}

/// Given a size in bytes, translate it to the byte suffix for an aarch64 ASM instruction.
#[rustfmt::skip]
macro_rules! size {
    (1) => { "b" };
    (2) => { "h" };
    // 4- and 8-byte accesses use the bare instruction (the register width
    // selects the size); 16-byte operations use the pair instructions, which
    // also take no size suffix.
    (4) => { "" };
    (8) => { "" };
    (16) => { "" };
}

/// Given a byte size, translate it to an Unsigned eXTend instruction
/// with the correct semantics.
///
/// See <https://developer.arm.com/documentation/ddi0596/2020-12/Base-Instructions/UXTB--Unsigned-Extend-Byte--an-alias-of-UBFM->
#[rustfmt::skip]
macro_rules! uxt {
    (1) => { "uxtb" };
    (2) => { "uxth" };
    // Wider values already fill the register, so a plain move suffices.
    ($_:tt) => { "mov" };
}
/// Given an atomic ordering and byte size, translate it to a LoaD eXclusive Register instruction
/// with the correct semantics.
///
/// E.g. `ldxr!(Acquire, 1)` expands to `"ldaxrb"`.
///
/// See <https://developer.arm.com/documentation/ddi0596/2020-12/Base-Instructions/LDXR--Load-Exclusive-Register->.
macro_rules! ldxr {
    ($ordering:ident, $bytes:tt) => {
        concat!("ld", acquire!($ordering), "xr", size!($bytes))
    };
}

/// Given an atomic ordering and byte size, translate it to a STore eXclusive Register instruction
/// with the correct semantics.
///
/// E.g. `stxr!(Release, 2)` expands to `"stlxrh"`.
///
/// See <https://developer.arm.com/documentation/ddi0596/2020-12/Base-Instructions/STXR--Store-Exclusive-Register->.
macro_rules! stxr {
    ($ordering:ident, $bytes:tt) => {
        concat!("st", release!($ordering), "xr", size!($bytes))
    };
}

/// Given an atomic ordering and byte size, translate it to a LoaD eXclusive Pair of registers instruction
/// with the correct semantics.
///
/// E.g. `ldxp!(Acquire)` expands to `"ldaxp"`.
///
/// See <https://developer.arm.com/documentation/ddi0596/2020-12/Base-Instructions/LDXP--Load-Exclusive-Pair-of-Registers->
macro_rules! ldxp {
    ($ordering:ident) => {
        concat!("ld", acquire!($ordering), "xp")
    };
}

/// Given an atomic ordering and byte size, translate it to a STore eXclusive Pair of registers instruction
/// with the correct semantics.
///
/// E.g. `stxp!(Release)` expands to `"stlxp"`.
///
/// See <https://developer.arm.com/documentation/ddi0596/2020-12/Base-Instructions/STXP--Store-Exclusive-Pair-of-registers->.
macro_rules! stxp {
    ($ordering:ident) => {
        concat!("st", release!($ordering), "xp")
    };
}
/// See <https://doc.rust-lang.org/stable/std/sync/atomic/struct.AtomicI8.html#method.compare_and_swap>.
///
/// Expands (via `intrinsics!`) to one outlined compare-and-swap helper built on a
/// load-exclusive/store-exclusive retry loop. The value observed before the
/// operation is returned in register 0.
macro_rules! compare_and_swap {
($ordering:ident, $bytes:tt, $name:ident) => {
intrinsics! {
#[maybe_use_optimized_c_shim]
#[unsafe(naked)]
pub unsafe extern "C" fn $name (
expected: int_ty!($bytes), desired: int_ty!($bytes), ptr: *mut int_ty!($bytes)
) -> int_ty!($bytes) {
// We can't use `AtomicI8::compare_and_swap`; we *are* compare_and_swap.
// x16/x17 are used as scratch registers throughout.
core::arch::naked_asm! {
// Zero-extend `expected` so the full-width `cmp` below is meaningful.
// UXT s(tmp0), s(0)
concat!(uxt!($bytes), " ", reg!($bytes, 16), ", ", reg!($bytes, 0)),
"0:",
// LDXR s(0), [x2]
concat!(ldxr!($ordering, $bytes), " ", reg!($bytes, 0), ", [x2]"),
// cmp s(0), s(tmp0)
concat!("cmp ", reg!($bytes, 0), ", ", reg!($bytes, 16)),
"bne 1f",
// STXR w(tmp1), s(1), [x2]
concat!(stxr!($ordering, $bytes), " w17, ", reg!($bytes, 1), ", [x2]"),
// Non-zero status means the exclusive store failed; retry.
"cbnz w17, 0b",
"1:",
"ret",
}
}
}
};
}
// i128 uses a completely different impl, so it has its own macro.
//
// The 16-byte CAS works on register pairs: `expected` arrives in x0:x1, `desired`
// in x2:x3 and the pointer in x4. `ccmp` folds the comparison of both halves into
// a single branch; the previously observed value is returned in x0:x1.
macro_rules! compare_and_swap_i128 {
($ordering:ident, $name:ident) => {
intrinsics! {
#[maybe_use_optimized_c_shim]
#[unsafe(naked)]
pub unsafe extern "C" fn $name (
expected: i128, desired: i128, ptr: *mut i128
) -> i128 {
core::arch::naked_asm! {
// Stash `expected` so x0:x1 can hold the loaded value.
"mov x16, x0",
"mov x17, x1",
"0:",
// LDXP x0, x1, [x4]
concat!(ldxp!($ordering), " x0, x1, [x4]"),
"cmp x0, x16",
"ccmp x1, x17, #0, eq",
"bne 1f",
// STXP w(tmp2), x2, x3, [x4]
concat!(stxp!($ordering), " w15, x2, x3, [x4]"),
// Non-zero status means the exclusive store failed; retry.
"cbnz w15, 0b",
"1:",
"ret",
}
}
}
};
}
/// See <https://doc.rust-lang.org/stable/std/sync/atomic/struct.AtomicI8.html#method.swap>.
///
/// Atomically replaces `*right_ptr` with `left`, returning the previous value.
macro_rules! swap {
($ordering:ident, $bytes:tt, $name:ident) => {
intrinsics! {
#[maybe_use_optimized_c_shim]
#[unsafe(naked)]
pub unsafe extern "C" fn $name (
left: int_ty!($bytes), right_ptr: *mut int_ty!($bytes)
) -> int_ty!($bytes) {
core::arch::naked_asm! {
// Stash `left` so register 0 can hold the loaded (returned) value.
// mov s(tmp0), s(0)
concat!("mov ", reg!($bytes, 16), ", ", reg!($bytes, 0)),
"0:",
// LDXR s(0), [x1]
concat!(ldxr!($ordering, $bytes), " ", reg!($bytes, 0), ", [x1]"),
// STXR w(tmp1), s(tmp0), [x1]
concat!(stxr!($ordering, $bytes), " w17, ", reg!($bytes, 16), ", [x1]"),
// Non-zero status means the exclusive store failed; retry.
"cbnz w17, 0b",
"ret",
}
}
}
};
}
/// See (e.g.) <https://doc.rust-lang.org/stable/std/sync/atomic/struct.AtomicI8.html#method.fetch_add>.
///
/// Shared LL/SC retry loop for the arithmetic/bitwise helpers: load-exclusive the
/// current value, combine it with the operand using `$op`, store-exclusive, and
/// retry until the store succeeds. The previous value is returned in register 0.
macro_rules! fetch_op {
($ordering:ident, $bytes:tt, $name:ident, $op:literal) => {
intrinsics! {
#[maybe_use_optimized_c_shim]
#[unsafe(naked)]
pub unsafe extern "C" fn $name (
val: int_ty!($bytes), ptr: *mut int_ty!($bytes)
) -> int_ty!($bytes) {
core::arch::naked_asm! {
// Stash the operand so register 0 can hold the loaded (returned) value.
// mov s(tmp0), s(0)
concat!("mov ", reg!($bytes, 16), ", ", reg!($bytes, 0)),
"0:",
// LDXR s(0), [x1]
concat!(ldxr!($ordering, $bytes), " ", reg!($bytes, 0), ", [x1]"),
// OP s(tmp1), s(0), s(tmp0)
concat!($op, " ", reg!($bytes, 17), ", ", reg!($bytes, 0), ", ", reg!($bytes, 16)),
// STXR w(tmp2), s(tmp1), [x1]
concat!(stxr!($ordering, $bytes), " w15, ", reg!($bytes, 17), ", [x1]"),
// Non-zero status means the exclusive store failed; retry.
"cbnz w15, 0b",
"ret",
}
}
}
}
}
// We need a single macro to pass to `foreach_ldadd`.
macro_rules! add {
($ordering:ident, $bytes:tt, $name:ident) => {
fetch_op! { $ordering, $bytes, $name, "add" }
};
}
// Passed to `foreach_ldclr` below: uses `bic` (bit clear) rather than `and`,
// since these helpers implement an atomic "clear the given bits" operation.
macro_rules! and {
($ordering:ident, $bytes:tt, $name:ident) => {
fetch_op! { $ordering, $bytes, $name, "bic" }
};
}
// Passed to `foreach_ldeor` (atomic exclusive-or).
macro_rules! xor {
($ordering:ident, $bytes:tt, $name:ident) => {
fetch_op! { $ordering, $bytes, $name, "eor" }
};
}
// Passed to `foreach_ldset` (atomic bit set, i.e. or).
macro_rules! or {
($ordering:ident, $bytes:tt, $name:ident) => {
fetch_op! { $ordering, $bytes, $name, "orr" }
};
}
// See `generate_aarch64_outlined_atomics` in build.rs.
include!(concat!(env!("OUT_DIR"), "/outlined_atomics.rs"));
// Each generated `foreach_*` macro invokes its argument macro once per
// (ordering, size) combination to define every outlined atomic symbol.
foreach_cas!(compare_and_swap);
foreach_cas16!(compare_and_swap_i128);
foreach_swp!(swap);
foreach_ldadd!(add);
foreach_ldclr!(and);
foreach_ldeor!(xor);
foreach_ldset!(or);

View file

@ -0,0 +1,280 @@
#![cfg(not(feature = "no-asm"))]
// Interfaces used by naked trampolines.
// These symbols are declared here so the `sym` operands in the naked functions
// below can reference them; the trampolines rearrange arguments/results to match
// the EABI's custom calling conventions.
extern "C" {
fn __udivmodsi4(a: u32, b: u32, rem: *mut u32) -> u32;
fn __udivmoddi4(a: u64, b: u64, rem: *mut u64) -> u64;
fn __divmoddi4(a: i64, b: i64, rem: *mut i64) -> i64;
}
extern "aapcs" {
// AAPCS is not always the correct ABI for these intrinsics, but we only use this to
// forward another `__aeabi_` call so it doesn't matter.
fn __aeabi_idiv(a: i32, b: i32) -> i32;
}
intrinsics! {
// NOTE This function and the ones below are implemented using assembly because they are using a
// custom calling convention which can't be implemented using a normal Rust function.
//
// 32-bit unsigned divmod: returns the quotient in r0 and the remainder in r1.
// A 4-byte stack slot is passed to `__udivmodsi4` as its out-pointer (r2), then
// loaded back into r1.
#[unsafe(naked)]
#[cfg(not(target_env = "msvc"))]
pub unsafe extern "C" fn __aeabi_uidivmod() {
core::arch::naked_asm!(
"push {{lr}}",
"sub sp, sp, #4",
"mov r2, sp",
"bl {trampoline}",
"ldr r1, [sp]",
"add sp, sp, #4",
"pop {{pc}}",
trampoline = sym crate::arm::__udivmodsi4
);
}
// 64-bit unsigned divmod: the quotient comes back in r0:r1 and the remainder is
// loaded into r2:r3 from the stack slot whose address was stacked for
// `__udivmoddi4`'s out-pointer.
#[unsafe(naked)]
pub unsafe extern "C" fn __aeabi_uldivmod() {
core::arch::naked_asm!(
"push {{r4, lr}}",
"sub sp, sp, #16",
"add r4, sp, #8",
"str r4, [sp]",
"bl {trampoline}",
"ldr r2, [sp, #8]",
"ldr r3, [sp, #12]",
"add sp, sp, #16",
"pop {{r4, pc}}",
trampoline = sym crate::arm::__udivmoddi4
);
}
// 32-bit signed divmod: saves the operands, lets `__aeabi_idiv` produce the
// quotient in r0, then reconstructs the remainder as `a - b * quotient` in r1.
#[unsafe(naked)]
pub unsafe extern "C" fn __aeabi_idivmod() {
core::arch::naked_asm!(
"push {{r0, r1, r4, lr}}",
"bl {trampoline}",
"pop {{r1, r2}}",
"muls r2, r2, r0",
"subs r1, r1, r2",
"pop {{r4, pc}}",
trampoline = sym crate::arm::__aeabi_idiv,
);
}
// 64-bit signed divmod: same stack layout as `__aeabi_uldivmod`, forwarding to
// `__divmoddi4`.
#[unsafe(naked)]
pub unsafe extern "C" fn __aeabi_ldivmod() {
core::arch::naked_asm!(
"push {{r4, lr}}",
"sub sp, sp, #16",
"add r4, sp, #8",
"str r4, [sp]",
"bl {trampoline}",
"ldr r2, [sp, #8]",
"ldr r3, [sp, #12]",
"add sp, sp, #16",
"pop {{r4, pc}}",
trampoline = sym crate::arm::__divmoddi4,
);
}
// FIXME(arm): The `*4` and `*8` variants should be defined as aliases.
/// `memcpy` provided with the `aapcs` ABI.
///
/// # Safety
///
/// Usual `memcpy` requirements apply.
#[cfg(not(target_vendor = "apple"))]
pub unsafe extern "aapcs" fn __aeabi_memcpy(dst: *mut u8, src: *const u8, n: usize) {
// SAFETY: memcpy preconditions apply.
unsafe { crate::mem::memcpy(dst, src, n) };
}
/// `memcpy` for 4-byte alignment.
///
/// # Safety
///
/// Usual `memcpy` requirements apply. Additionally, `dst` and `src` must be aligned to
/// four bytes.
#[cfg(not(target_vendor = "apple"))]
pub unsafe extern "aapcs" fn __aeabi_memcpy4(dst: *mut u8, src: *const u8, n: usize) {
// We are guaranteed 4-alignment, so accessing at u32 is okay.
let mut dst = dst.cast::<u32>();
let mut src = src.cast::<u32>();
debug_assert!(dst.is_aligned());
debug_assert!(src.is_aligned());
let mut n = n;
// Copy one 32-bit word per iteration while at least a full word remains.
while n >= 4 {
// SAFETY: `dst` and `src` are both valid for at least 4 bytes, from
// `memcpy` preconditions and the loop guard.
unsafe { *dst = *src };
// FIXME(addr): if we can make this end-of-address-space safe without losing
// performance, we may want to consider that.
// SAFETY: memcpy is not expected to work at the end of the address space
unsafe {
dst = dst.offset(1);
src = src.offset(1);
}
n -= 4;
}
// Delegate the remaining 0..=3 tail bytes to the byte-wise version.
// SAFETY: `dst` and `src` will still be valid for `n` bytes
unsafe { __aeabi_memcpy(dst.cast::<u8>(), src.cast::<u8>(), n) };
}
/// `memcpy` for 8-byte alignment.
///
/// # Safety
///
/// Usual `memcpy` requirements apply. Additionally, `dst` and `src` must be aligned to
/// eight bytes.
#[cfg(not(target_vendor = "apple"))]
pub unsafe extern "aapcs" fn __aeabi_memcpy8(dst: *mut u8, src: *const u8, n: usize) {
debug_assert!(dst.addr() & 7 == 0);
debug_assert!(src.addr() & 7 == 0);
// SAFETY: memcpy preconditions apply, less strict alignment.
unsafe { __aeabi_memcpy4(dst, src, n) };
}
/// `memmove` provided with the `aapcs` ABI.
///
/// # Safety
///
/// Usual `memmove` requirements apply.
#[cfg(not(target_vendor = "apple"))]
pub unsafe extern "aapcs" fn __aeabi_memmove(dst: *mut u8, src: *const u8, n: usize) {
// SAFETY: memmove preconditions apply.
unsafe { crate::mem::memmove(dst, src, n) };
}
/// `memmove` for 4-byte alignment.
///
/// # Safety
///
/// Usual `memmove` requirements apply. Additionally, `dst` and `src` must be aligned to
/// four bytes.
#[cfg(not(any(target_vendor = "apple", target_env = "msvc")))]
pub unsafe extern "aapcs" fn __aeabi_memmove4(dst: *mut u8, src: *const u8, n: usize) {
debug_assert!(dst.addr() & 3 == 0);
debug_assert!(src.addr() & 3 == 0);
// SAFETY: same preconditions, less strict alignment.
unsafe { __aeabi_memmove(dst, src, n) };
}
/// `memmove` for 8-byte alignment.
///
/// # Safety
///
/// Usual `memmove` requirements apply. Additionally, `dst` and `src` must be aligned to
/// eight bytes.
#[cfg(not(any(target_vendor = "apple", target_env = "msvc")))]
pub unsafe extern "aapcs" fn __aeabi_memmove8(dst: *mut u8, src: *const u8, n: usize) {
debug_assert!(dst.addr() & 7 == 0);
debug_assert!(src.addr() & 7 == 0);
// SAFETY: memmove preconditions apply, less strict alignment.
unsafe { __aeabi_memmove(dst, src, n) };
}
/// `memset` provided with the `aapcs` ABI.
///
/// # Safety
///
/// Usual `memset` requirements apply.
#[cfg(not(target_vendor = "apple"))]
pub unsafe extern "aapcs" fn __aeabi_memset(dst: *mut u8, n: usize, c: i32) {
// Note the different argument order
// SAFETY: memset preconditions apply.
unsafe { crate::mem::memset(dst, c, n) };
}
/// `memset` for 4-byte alignment.
///
/// # Safety
///
/// Usual `memset` requirements apply. Additionally, `dst` must be aligned to
/// four bytes.
#[cfg(not(target_vendor = "apple"))]
pub unsafe extern "aapcs" fn __aeabi_memset4(dst: *mut u8, n: usize, c: i32) {
let mut dst = dst.cast::<u32>();
debug_assert!(dst.is_aligned());
let mut n = n;
// Truncate the fill value to its low byte and splat it across a 32-bit word.
let byte = (c as u32) & 0xff;
let c = (byte << 24) | (byte << 16) | (byte << 8) | byte;
// Store one whole word per iteration while at least a full word remains.
while n >= 4 {
// SAFETY: `dst` is valid for at least 4 bytes, from `memset` preconditions and
// the loop guard.
unsafe { *dst = c };
// FIXME(addr): if we can make this end-of-address-space safe without losing
// performance, we may want to consider that.
// SAFETY: memcpy is not expected to work at the end of the address space
unsafe {
dst = dst.offset(1);
}
n -= 4;
}
// Delegate the remaining 0..=3 tail bytes to the byte-wise version.
// SAFETY: `dst` will still be valid for `n` bytes
unsafe { __aeabi_memset(dst.cast::<u8>(), n, byte as i32) };
}
/// `memset` for 8-byte alignment.
///
/// # Safety
///
/// Usual `memset` requirements apply. Additionally, `dst` must be aligned to
/// eight bytes.
#[cfg(not(target_vendor = "apple"))]
pub unsafe extern "aapcs" fn __aeabi_memset8(dst: *mut u8, n: usize, c: i32) {
debug_assert!(dst.addr() & 7 == 0);
// SAFETY: memset preconditions apply, less strict alignment.
unsafe { __aeabi_memset4(dst, n, c) };
}
/// `memclr` provided with the `aapcs` ABI: zero-fills `n` bytes at `dst`.
///
/// # Safety
///
/// Usual `memclr` requirements apply.
#[cfg(not(target_vendor = "apple"))]
pub unsafe extern "aapcs" fn __aeabi_memclr(dst: *mut u8, n: usize) {
// SAFETY: memclr preconditions apply; this is `memset` with a zero fill byte.
unsafe { __aeabi_memset(dst, n, 0) };
}
/// `memclr` for 4-byte alignment.
///
/// # Safety
///
/// Usual `memclr` requirements apply. Additionally, `dst` must be aligned to
/// four bytes.
#[cfg(not(any(target_vendor = "apple", target_env = "msvc")))]
pub unsafe extern "aapcs" fn __aeabi_memclr4(dst: *mut u8, n: usize) {
debug_assert!(dst.addr() & 3 == 0);
// SAFETY: memclr preconditions apply; same 4-byte alignment as `__aeabi_memset4`.
unsafe { __aeabi_memset4(dst, n, 0) };
}
/// `memclr` for 8-byte alignment.
///
/// # Safety
///
/// Usual `memclr` requirements apply. Additionally, `dst` must be aligned to
/// eight bytes.
#[cfg(not(any(target_vendor = "apple", target_env = "msvc")))]
pub unsafe extern "aapcs" fn __aeabi_memclr8(dst: *mut u8, n: usize) {
debug_assert!(dst.addr() & 7 == 0);
// SAFETY: memclr preconditions apply, less strict alignment.
unsafe { __aeabi_memset4(dst, n, 0) };
}
}

View file

@ -0,0 +1,290 @@
use core::sync::atomic::{AtomicU32, Ordering};
use core::{arch, mem};
// Kernel-provided user-mode helper functions:
// https://www.kernel.org/doc/Documentation/arm/kernel_user_helpers.txt
//
// `__kuser_cmpxchg` lives at the fixed address 0xffff0fc0. Per the document
// above it atomically replaces `*ptr` with `newval` if it currently holds
// `oldval`, returning zero on success — hence the `== 0` translation to `bool`.
unsafe fn __kuser_cmpxchg(oldval: u32, newval: u32, ptr: *mut u32) -> bool {
let f: extern "C" fn(u32, u32, *mut u32) -> u32 = mem::transmute(0xffff0fc0usize as *const ());
f(oldval, newval, ptr) == 0
}
// `__kuser_memory_barrier` (fixed address 0xffff0fa0) issues a memory barrier.
unsafe fn __kuser_memory_barrier() {
let f: extern "C" fn() = mem::transmute(0xffff0fa0usize as *const ());
f();
}
// Round a pointer down to the 4-byte word that contains it.
fn align_ptr<T>(ptr: *mut T) -> *mut u32 {
    // For `T == u32` the subtraction yields 0 low bits to clear, so a `u32`
    // pointer (already expected to be word-aligned) passes through unchanged.
    let low_bits = 3 & (4 - mem::size_of::<T>());
    let word_addr = (ptr as usize) & !low_bits;
    word_addr as *mut u32
}
// Compute the (shift, mask) pair that addresses a `T`-sized element inside the
// aligned 32-bit word containing it.
fn get_shift_mask<T>(ptr: *mut T) -> (u32, u32) {
    let size = mem::size_of::<T>();
    // Mask covering the low byte/halfword/word.
    let mask: u32 = match size {
        1 => 0xff,
        2 => 0xffff,
        4 => 0xffffffff,
        _ => unreachable!(),
    };
    // On big-endian targets the element order within the word is reversed, so
    // the shift has to be mirrored.
    let endian_adjust = if cfg!(target_endian = "little") {
        0
    } else {
        4 - size as u32
    };
    // Bit offset of the element within its containing word.
    let ptr_mask = 3 & (4 - size);
    let shift = ((ptr as usize & ptr_mask) as u32 ^ endian_adjust) * 8;
    (shift, mask)
}
// Pull the `mask`-wide field at bit offset `shift` out of an aligned word.
fn extract_aligned(aligned: u32, shift: u32, mask: u32) -> u32 {
    aligned >> shift & mask
}
// Write `val` into the `mask`-wide field at bit offset `shift` of `aligned`,
// leaving all other bits of the word untouched.
fn insert_aligned(aligned: u32, val: u32, shift: u32, mask: u32) -> u32 {
    let cleared = aligned & !(mask << shift);
    cleared | ((val & mask) << shift)
}
/// Performs a relaxed atomic load of 4 bytes at `ptr`. Some of the bytes are allowed to be out of
/// bounds as long as `size_of::<T>()` bytes are in bounds.
///
/// # Safety
///
/// - `ptr` must be 4-aligned.
/// - `size_of::<T>()` must be at most 4.
/// - if `size_of::<T>() == 1`, `ptr` or `ptr` offset by 1, 2 or 3 bytes must be valid for a relaxed
/// atomic read of 1 byte.
/// - if `size_of::<T>() == 2`, `ptr` or `ptr` offset by 2 bytes must be valid for a relaxed atomic
/// read of 2 bytes.
/// - if `size_of::<T>() == 4`, `ptr` must be valid for a relaxed atomic read of 4 bytes.
unsafe fn atomic_load_aligned<T>(ptr: *mut u32) -> u32 {
if mem::size_of::<T>() == 4 {
// SAFETY: As `T` has a size of 4, the caller guarantees this is sound.
unsafe { AtomicU32::from_ptr(ptr).load(Ordering::Relaxed) }
} else {
// SAFETY:
// As all 4 bytes pointed to by `ptr` might not be dereferenceable due to being out of
// bounds when doing atomic operations on a `u8`/`i8`/`u16`/`i16`, inline ASM is used to
// avoid causing undefined behaviour. However, as `ptr` is 4-aligned and at least 1 byte of
// `ptr` is dereferenceable, the load won't cause a segfault as the page size is always
// larger than 4 bytes.
// The `ldr` instruction does not touch the stack or flags, or write to memory, so
// `nostack`, `preserves_flags` and `readonly` are sound. The caller guarantees that `ptr` is
// 4-aligned, as required by `ldr`.
unsafe {
let res: u32;
arch::asm!(
"ldr {res}, [{ptr}]",
ptr = in(reg) ptr,
res = lateout(reg) res,
options(nostack, preserves_flags, readonly)
);
res
}
}
}
// Generic atomic read-modify-write operation
//
// Loads the aligned word containing the `T`-sized element, applies `f` to the
// embedded value, and retries via `__kuser_cmpxchg` until the word is swapped
// without interference. `g` selects the return value from `(old, new)`.
unsafe fn atomic_rmw<T, F: Fn(u32) -> u32, G: Fn(u32, u32) -> u32>(ptr: *mut T, f: F, g: G) -> u32 {
let aligned_ptr = align_ptr(ptr);
let (shift, mask) = get_shift_mask(ptr);
loop {
let curval_aligned = atomic_load_aligned::<T>(aligned_ptr);
let curval = extract_aligned(curval_aligned, shift, mask);
let newval = f(curval);
let newval_aligned = insert_aligned(curval_aligned, newval, shift, mask);
if __kuser_cmpxchg(curval_aligned, newval_aligned, aligned_ptr) {
return g(curval, newval);
}
}
}
// Generic atomic compare-exchange operation
//
// Returns the value observed before the operation: `oldval` on success, or the
// differing current value on failure.
unsafe fn atomic_cmpxchg<T>(ptr: *mut T, oldval: u32, newval: u32) -> u32 {
let aligned_ptr = align_ptr(ptr);
let (shift, mask) = get_shift_mask(ptr);
loop {
let curval_aligned = atomic_load_aligned::<T>(aligned_ptr);
let curval = extract_aligned(curval_aligned, shift, mask);
if curval != oldval {
return curval;
}
let newval_aligned = insert_aligned(curval_aligned, newval, shift, mask);
if __kuser_cmpxchg(curval_aligned, newval_aligned, aligned_ptr) {
return oldval;
}
}
}
// Generates an `extern "C"` atomic read-modify-write intrinsic.
//
// `$op` computes the new value from `(current, operand)`; `$fetch` picks the
// return value from `(old, new)`. The `@old` form returns the previous value,
// the `@new` form returns the updated one.
macro_rules! atomic_rmw {
($name:ident, $ty:ty, $op:expr, $fetch:expr) => {
intrinsics! {
pub unsafe extern "C" fn $name(ptr: *mut $ty, val: $ty) -> $ty {
atomic_rmw(ptr, |x| $op(x as $ty, val) as u32, |old, new| $fetch(old, new)) as $ty
}
}
};
(@old $name:ident, $ty:ty, $op:expr) => {
atomic_rmw!($name, $ty, $op, |old, _| old);
};
(@new $name:ident, $ty:ty, $op:expr) => {
atomic_rmw!($name, $ty, $op, |_, new| new);
};
}
// Generates an `extern "C"` atomic compare-and-swap intrinsic that returns the
// previously observed value (equal to `oldval` on success).
macro_rules! atomic_cmpxchg {
($name:ident, $ty:ty) => {
intrinsics! {
pub unsafe extern "C" fn $name(ptr: *mut $ty, oldval: $ty, newval: $ty) -> $ty {
atomic_cmpxchg(ptr, oldval as u32, newval as u32) as $ty
}
}
};
}
// `@old` variants return the value before the operation (`__sync_fetch_and_*`);
// `@new` variants return the value after it (`__sync_*_and_fetch`).
atomic_rmw!(@old __sync_fetch_and_add_1, u8, |a: u8, b: u8| a.wrapping_add(b));
atomic_rmw!(@old __sync_fetch_and_add_2, u16, |a: u16, b: u16| a.wrapping_add(b));
atomic_rmw!(@old __sync_fetch_and_add_4, u32, |a: u32, b: u32| a.wrapping_add(b));
atomic_rmw!(@new __sync_add_and_fetch_1, u8, |a: u8, b: u8| a.wrapping_add(b));
atomic_rmw!(@new __sync_add_and_fetch_2, u16, |a: u16, b: u16| a.wrapping_add(b));
atomic_rmw!(@new __sync_add_and_fetch_4, u32, |a: u32, b: u32| a.wrapping_add(b));
atomic_rmw!(@old __sync_fetch_and_sub_1, u8, |a: u8, b: u8| a.wrapping_sub(b));
atomic_rmw!(@old __sync_fetch_and_sub_2, u16, |a: u16, b: u16| a.wrapping_sub(b));
atomic_rmw!(@old __sync_fetch_and_sub_4, u32, |a: u32, b: u32| a.wrapping_sub(b));
atomic_rmw!(@new __sync_sub_and_fetch_1, u8, |a: u8, b: u8| a.wrapping_sub(b));
atomic_rmw!(@new __sync_sub_and_fetch_2, u16, |a: u16, b: u16| a.wrapping_sub(b));
atomic_rmw!(@new __sync_sub_and_fetch_4, u32, |a: u32, b: u32| a.wrapping_sub(b));
atomic_rmw!(@old __sync_fetch_and_and_1, u8, |a: u8, b: u8| a & b);
atomic_rmw!(@old __sync_fetch_and_and_2, u16, |a: u16, b: u16| a & b);
atomic_rmw!(@old __sync_fetch_and_and_4, u32, |a: u32, b: u32| a & b);
atomic_rmw!(@new __sync_and_and_fetch_1, u8, |a: u8, b: u8| a & b);
atomic_rmw!(@new __sync_and_and_fetch_2, u16, |a: u16, b: u16| a & b);
atomic_rmw!(@new __sync_and_and_fetch_4, u32, |a: u32, b: u32| a & b);
atomic_rmw!(@old __sync_fetch_and_or_1, u8, |a: u8, b: u8| a | b);
atomic_rmw!(@old __sync_fetch_and_or_2, u16, |a: u16, b: u16| a | b);
atomic_rmw!(@old __sync_fetch_and_or_4, u32, |a: u32, b: u32| a | b);
atomic_rmw!(@new __sync_or_and_fetch_1, u8, |a: u8, b: u8| a | b);
atomic_rmw!(@new __sync_or_and_fetch_2, u16, |a: u16, b: u16| a | b);
atomic_rmw!(@new __sync_or_and_fetch_4, u32, |a: u32, b: u32| a | b);
atomic_rmw!(@old __sync_fetch_and_xor_1, u8, |a: u8, b: u8| a ^ b);
atomic_rmw!(@old __sync_fetch_and_xor_2, u16, |a: u16, b: u16| a ^ b);
atomic_rmw!(@old __sync_fetch_and_xor_4, u32, |a: u32, b: u32| a ^ b);
atomic_rmw!(@new __sync_xor_and_fetch_1, u8, |a: u8, b: u8| a ^ b);
atomic_rmw!(@new __sync_xor_and_fetch_2, u16, |a: u16, b: u16| a ^ b);
atomic_rmw!(@new __sync_xor_and_fetch_4, u32, |a: u32, b: u32| a ^ b);
atomic_rmw!(@old __sync_fetch_and_nand_1, u8, |a: u8, b: u8| !(a & b));
atomic_rmw!(@old __sync_fetch_and_nand_2, u16, |a: u16, b: u16| !(a & b));
atomic_rmw!(@old __sync_fetch_and_nand_4, u32, |a: u32, b: u32| !(a & b));
atomic_rmw!(@new __sync_nand_and_fetch_1, u8, |a: u8, b: u8| !(a & b));
atomic_rmw!(@new __sync_nand_and_fetch_2, u16, |a: u16, b: u16| !(a & b));
atomic_rmw!(@new __sync_nand_and_fetch_4, u32, |a: u32, b: u32| !(a & b));
// For primitive integers `Ord::max`/`Ord::min` are exactly
// `if a > b { a } else { b }` and `if a < b { a } else { b }`.
atomic_rmw!(@old __sync_fetch_and_max_1, i8, |a: i8, b: i8| a.max(b));
atomic_rmw!(@old __sync_fetch_and_max_2, i16, |a: i16, b: i16| a.max(b));
atomic_rmw!(@old __sync_fetch_and_max_4, i32, |a: i32, b: i32| a.max(b));
atomic_rmw!(@old __sync_fetch_and_umax_1, u8, |a: u8, b: u8| a.max(b));
atomic_rmw!(@old __sync_fetch_and_umax_2, u16, |a: u16, b: u16| a.max(b));
atomic_rmw!(@old __sync_fetch_and_umax_4, u32, |a: u32, b: u32| a.max(b));
atomic_rmw!(@old __sync_fetch_and_min_1, i8, |a: i8, b: i8| a.min(b));
atomic_rmw!(@old __sync_fetch_and_min_2, i16, |a: i16, b: i16| a.min(b));
atomic_rmw!(@old __sync_fetch_and_min_4, i32, |a: i32, b: i32| a.min(b));
atomic_rmw!(@old __sync_fetch_and_umin_1, u8, |a: u8, b: u8| a.min(b));
atomic_rmw!(@old __sync_fetch_and_umin_2, u16, |a: u16, b: u16| a.min(b));
atomic_rmw!(@old __sync_fetch_and_umin_4, u32, |a: u32, b: u32| a.min(b));
// test-and-set stores the operand unconditionally, returning the previous value.
atomic_rmw!(@old __sync_lock_test_and_set_1, u8, |_: u8, b: u8| b);
atomic_rmw!(@old __sync_lock_test_and_set_2, u16, |_: u16, b: u16| b);
atomic_rmw!(@old __sync_lock_test_and_set_4, u32, |_: u32, b: u32| b);
atomic_cmpxchg!(__sync_val_compare_and_swap_1, u8);
atomic_cmpxchg!(__sync_val_compare_and_swap_2, u16);
atomic_cmpxchg!(__sync_val_compare_and_swap_4, u32);
intrinsics! {
// Full memory barrier via the kernel-provided `__kuser_memory_barrier` helper.
pub unsafe extern "C" fn __sync_synchronize() {
__kuser_memory_barrier();
}
}

View file

@ -0,0 +1,23 @@
intrinsics! {
pub unsafe extern "C" fn abort() -> ! {
// On AVRs, an architecture that doesn't support traps, unreachable code
// paths get lowered into calls to `abort`:
//
// https://github.com/llvm/llvm-project/blob/cbe8f3ad7621e402b050e768f400ff0d19c3aedd/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp#L4462
//
// When control gets here, it means that either core::intrinsics::abort()
// was called or an undefined behavior has occurred, so there's not that
// much we can do to recover - we can't `panic!()`, because for all we
// know the environment is gone now, so panicking might end up with us
// getting back to this very function.
//
// So let's do the next best thing, loop.
//
// Alternatively we could (try to) restart the program, but since
// undefined behavior is undefined, there's really no obligation for us
// to do anything here - for all we care, we could just set the chip on
// fire; but that'd be bad for the environment.
loop {}
}
}

View file

@ -0,0 +1,209 @@
use crate::float::Float;
use crate::int::{CastInto, Int, MinInt};
/// Returns `a + b`
///
/// Generic IEEE 754 addition on the raw bit representation, shared by every float
/// format via the `Float` trait. Subtraction of magnitudes is selected by the sign
/// bits; the final rounding is round-to-nearest with ties-to-even.
fn add<F: Float>(a: F, b: F) -> F
where
u32: CastInto<F::Int>,
F::Int: CastInto<u32>,
i32: CastInto<F::Int>,
F::Int: CastInto<i32>,
{
let one = F::Int::ONE;
let zero = F::Int::ZERO;
let bits = F::BITS.cast();
let significand_bits = F::SIG_BITS;
let max_exponent = F::EXP_SAT;
let implicit_bit = F::IMPLICIT_BIT;
let significand_mask = F::SIG_MASK;
let sign_bit = F::SIGN_MASK as F::Int;
let abs_mask = sign_bit - one;
let exponent_mask = F::EXP_MASK;
let inf_rep = exponent_mask;
let quiet_bit = implicit_bit >> 1;
let qnan_rep = exponent_mask | quiet_bit;
let mut a_rep = a.to_bits();
let mut b_rep = b.to_bits();
let a_abs = a_rep & abs_mask;
let b_abs = b_rep & abs_mask;
// Detect if a or b is zero, infinity, or NaN.
if a_abs.wrapping_sub(one) >= inf_rep - one || b_abs.wrapping_sub(one) >= inf_rep - one {
// NaN + anything = qNaN
if a_abs > inf_rep {
return F::from_bits(a_abs | quiet_bit);
}
// anything + NaN = qNaN
if b_abs > inf_rep {
return F::from_bits(b_abs | quiet_bit);
}
if a_abs == inf_rep {
// +/-infinity + -/+infinity = qNaN
if (a.to_bits() ^ b.to_bits()) == sign_bit {
return F::from_bits(qnan_rep);
} else {
// +/-infinity + anything remaining = +/- infinity
return a;
}
}
// anything remaining + +/-infinity = +/-infinity
if b_abs == inf_rep {
return b;
}
// zero + anything = anything
if a_abs == MinInt::ZERO {
// but we need to get the sign right for zero + zero
if b_abs == MinInt::ZERO {
return F::from_bits(a.to_bits() & b.to_bits());
} else {
return b;
}
}
// anything + zero = anything
if b_abs == MinInt::ZERO {
return a;
}
}
// Swap a and b if necessary so that a has the larger absolute value.
if b_abs > a_abs {
// Don't use mem::swap because it may generate references to memcpy in unoptimized code.
let tmp = a_rep;
a_rep = b_rep;
b_rep = tmp;
}
// Extract the exponent and significand from the (possibly swapped) a and b.
let mut a_exponent: i32 = ((a_rep & exponent_mask) >> significand_bits).cast();
let mut b_exponent: i32 = ((b_rep & exponent_mask) >> significand_bits).cast();
let mut a_significand = a_rep & significand_mask;
let mut b_significand = b_rep & significand_mask;
// normalize any denormals, and adjust the exponent accordingly.
if a_exponent == 0 {
let (exponent, significand) = F::normalize(a_significand);
a_exponent = exponent;
a_significand = significand;
}
if b_exponent == 0 {
let (exponent, significand) = F::normalize(b_significand);
b_exponent = exponent;
b_significand = significand;
}
// The sign of the result is the sign of the larger operand, a. If they
// have opposite signs, we are performing a subtraction; otherwise addition.
let result_sign = a_rep & sign_bit;
let subtraction = ((a_rep ^ b_rep) & sign_bit) != zero;
// Shift the significands to give us round, guard and sticky, and or in the
// implicit significand bit. (If we fell through from the denormal path it
// was already set by normalize(), but setting it twice won't hurt
// anything.)
a_significand = (a_significand | implicit_bit) << 3;
b_significand = (b_significand | implicit_bit) << 3;
// Shift the significand of b by the difference in exponents, with a sticky
// bottom bit to get rounding correct.
let align = a_exponent.wrapping_sub(b_exponent).cast();
if align != MinInt::ZERO {
if align < bits {
let sticky =
F::Int::from_bool(b_significand << bits.wrapping_sub(align).cast() != MinInt::ZERO);
b_significand = (b_significand >> align.cast()) | sticky;
} else {
b_significand = one; // sticky; b is known to be non-zero.
}
}
if subtraction {
a_significand = a_significand.wrapping_sub(b_significand);
// If a == -b, return +zero.
if a_significand == MinInt::ZERO {
return F::from_bits(MinInt::ZERO);
}
// If partial cancellation occurred, we need to left-shift the result
// and adjust the exponent:
if a_significand < implicit_bit << 3 {
let shift =
a_significand.leading_zeros() as i32 - (implicit_bit << 3).leading_zeros() as i32;
a_significand <<= shift;
a_exponent -= shift;
}
} else {
// addition
a_significand += b_significand;
// If the addition carried up, we need to right-shift the result and
// adjust the exponent:
if a_significand & (implicit_bit << 4) != MinInt::ZERO {
let sticky = F::Int::from_bool(a_significand & one != MinInt::ZERO);
a_significand = (a_significand >> 1) | sticky;
a_exponent += 1;
}
}
// If we have overflowed the type, return +/- infinity:
if a_exponent >= max_exponent as i32 {
return F::from_bits(inf_rep | result_sign);
}
if a_exponent <= 0 {
// Result is denormal before rounding; the exponent is zero and we
// need to shift the significand.
let shift = (1 - a_exponent).cast();
let sticky =
F::Int::from_bool((a_significand << bits.wrapping_sub(shift).cast()) != MinInt::ZERO);
a_significand = (a_significand >> shift.cast()) | sticky;
a_exponent = 0;
}
// Low three bits are round, guard, and sticky.
let a_significand_i32: i32 = a_significand.cast();
let round_guard_sticky: i32 = a_significand_i32 & 0x7;
// Shift the significand into place, and mask off the implicit bit.
let mut result = (a_significand >> 3) & significand_mask;
// Insert the exponent and sign.
result |= a_exponent.cast() << significand_bits;
result |= result_sign;
// Final rounding. The result may overflow to infinity, but that is the
// correct result in that case.
// Round up when strictly above the halfway point (round bit set plus any
// guard/sticky bit) ...
if round_guard_sticky > 0x4 {
result += one;
}
// ... and break exact ties to even: bump only when the low result bit is 1.
if round_guard_sticky == 0x4 {
result += result & one;
}
F::from_bits(result)
}
intrinsics! {
// `f32` addition; on ARM also exported under the AEABI name `__aeabi_fadd`.
#[aapcs_on_arm]
#[arm_aeabi_alias = __aeabi_fadd]
pub extern "C" fn __addsf3(a: f32, b: f32) -> f32 {
add(a, b)
}
// `f64` addition; on ARM also exported under the AEABI name `__aeabi_dadd`.
#[aapcs_on_arm]
#[arm_aeabi_alias = __aeabi_dadd]
pub extern "C" fn __adddf3(a: f64, b: f64) -> f64 {
add(a, b)
}
// `f128` addition; on PowerPC exported as `__addkf3` instead.
#[ppc_alias = __addkf3]
#[cfg(f128_enabled)]
pub extern "C" fn __addtf3(a: f128, b: f128) -> f128 {
add(a, b)
}
}

View file

@ -0,0 +1,248 @@
#![allow(unreachable_code)]
use crate::float::Float;
use crate::int::MinInt;
// Width of a comparison result in the C ABI: AVR uses an 8-bit value, all
// other targets a 32-bit value.
// https://github.com/llvm/llvm-project/blob/1e6ba3cd2fe96be00b6ed6ba28b3d9f9271d784d/compiler-rt/lib/builtins/fp_compare_impl.inc#L22
#[cfg(target_arch = "avr")]
pub type CmpResult = i8;
// https://github.com/llvm/llvm-project/blob/1e6ba3cd2fe96be00b6ed6ba28b3d9f9271d784d/compiler-rt/lib/builtins/fp_compare_impl.inc#L25
#[cfg(not(target_arch = "avr"))]
pub type CmpResult = i32;
/// Three-way comparison outcome, with `Unordered` for NaN operands.
#[derive(Clone, Copy)]
enum Result {
    Less,
    Equal,
    Greater,
    Unordered,
}
impl Result {
    /// Encoding used by the `__le*`-style intrinsics: unordered operands are
    /// reported as "greater" (positive).
    fn to_le_abi(self) -> CmpResult {
        match self {
            Result::Less => -1,
            Result::Equal => 0,
            Result::Greater | Result::Unordered => 1,
        }
    }
    /// Encoding used by the `__ge*`-style intrinsics: unordered operands are
    /// reported as "less" (negative).
    fn to_ge_abi(self) -> CmpResult {
        match self {
            Result::Less | Result::Unordered => -1,
            Result::Equal => 0,
            Result::Greater => 1,
        }
    }
}
// Three-way compare of `a` and `b` on their bit representations, mapping any
// NaN operand to `Result::Unordered`.
fn cmp<F: Float>(a: F, b: F) -> Result {
let one = F::Int::ONE;
let zero = F::Int::ZERO;
let szero = F::SignedInt::ZERO;
let sign_bit = F::SIGN_MASK as F::Int;
let abs_mask = sign_bit - one;
let exponent_mask = F::EXP_MASK;
let inf_rep = exponent_mask;
let a_rep = a.to_bits();
let b_rep = b.to_bits();
let a_abs = a_rep & abs_mask;
let b_abs = b_rep & abs_mask;
// If either a or b is NaN, they are unordered.
if a_abs > inf_rep || b_abs > inf_rep {
return Result::Unordered;
}
// If a and b are both zeros, they are equal.
if a_abs | b_abs == zero {
return Result::Equal;
}
let a_srep = a.to_bits_signed();
let b_srep = b.to_bits_signed();
// If at least one of a and b is positive, we get the same result comparing
// a and b as signed integers as we would with a floating-point compare.
if a_srep & b_srep >= szero {
if a_srep < b_srep {
Result::Less
} else if a_srep == b_srep {
Result::Equal
} else {
Result::Greater
}
// Otherwise, both are negative, so we need to flip the sense of the
// comparison to get the correct result. (This assumes a twos- or ones-
// complement integer representation; if integers are represented in a
// sign-magnitude representation, then this flip is incorrect).
} else if a_srep > b_srep {
Result::Less
} else if a_srep == b_srep {
Result::Equal
} else {
Result::Greater
}
}
/// Returns `true` if `a` and `b` are unordered, i.e. at least one of them is NaN.
fn unord<F: Float>(a: F, b: F) -> bool {
    // A NaN's magnitude bits (everything below the sign bit) compare strictly
    // greater than the all-exponent-ones infinity pattern.
    let abs_mask = (F::SIGN_MASK as F::Int) - F::Int::ONE;
    let inf_rep = F::EXP_MASK;
    a.to_bits() & abs_mask > inf_rep || b.to_bits() & abs_mask > inf_rep
}
intrinsics! {
    // Three-way comparison intrinsics for f32/f64. Each returns a negative,
    // zero, or positive `CmpResult` for less/equal/greater. The `to_le_abi`
    // routines map unordered (NaN) to 1 and back `__le*`, `__lt*`, `__eq*`,
    // and `__ne*`; the `to_ge_abi` routines map unordered to -1 and back
    // `__ge*` and `__gt*`.
    pub extern "C" fn __lesf2(a: f32, b: f32) -> crate::float::cmp::CmpResult {
        cmp(a, b).to_le_abi()
    }
    pub extern "C" fn __gesf2(a: f32, b: f32) -> crate::float::cmp::CmpResult {
        cmp(a, b).to_ge_abi()
    }
    #[arm_aeabi_alias = __aeabi_fcmpun]
    pub extern "C" fn __unordsf2(a: f32, b: f32) -> crate::float::cmp::CmpResult {
        // Nonzero iff either operand is NaN.
        unord(a, b) as crate::float::cmp::CmpResult
    }
    pub extern "C" fn __eqsf2(a: f32, b: f32) -> crate::float::cmp::CmpResult {
        cmp(a, b).to_le_abi()
    }
    pub extern "C" fn __ltsf2(a: f32, b: f32) -> crate::float::cmp::CmpResult {
        cmp(a, b).to_le_abi()
    }
    pub extern "C" fn __nesf2(a: f32, b: f32) -> crate::float::cmp::CmpResult {
        cmp(a, b).to_le_abi()
    }
    pub extern "C" fn __gtsf2(a: f32, b: f32) -> crate::float::cmp::CmpResult {
        cmp(a, b).to_ge_abi()
    }
    pub extern "C" fn __ledf2(a: f64, b: f64) -> crate::float::cmp::CmpResult {
        cmp(a, b).to_le_abi()
    }
    pub extern "C" fn __gedf2(a: f64, b: f64) -> crate::float::cmp::CmpResult {
        cmp(a, b).to_ge_abi()
    }
    #[arm_aeabi_alias = __aeabi_dcmpun]
    pub extern "C" fn __unorddf2(a: f64, b: f64) -> crate::float::cmp::CmpResult {
        // Nonzero iff either operand is NaN.
        unord(a, b) as crate::float::cmp::CmpResult
    }
    pub extern "C" fn __eqdf2(a: f64, b: f64) -> crate::float::cmp::CmpResult {
        cmp(a, b).to_le_abi()
    }
    pub extern "C" fn __ltdf2(a: f64, b: f64) -> crate::float::cmp::CmpResult {
        cmp(a, b).to_le_abi()
    }
    pub extern "C" fn __nedf2(a: f64, b: f64) -> crate::float::cmp::CmpResult {
        cmp(a, b).to_le_abi()
    }
    pub extern "C" fn __gtdf2(a: f64, b: f64) -> crate::float::cmp::CmpResult {
        cmp(a, b).to_ge_abi()
    }
}
#[cfg(f128_enabled)]
intrinsics! {
    // `f128` three-way comparison intrinsics. Semantics match the f32/f64
    // versions above; on PowerPC these are emitted under the `*kf2` names
    // (see the `ppc_alias` attributes).
    #[ppc_alias = __lekf2]
    pub extern "C" fn __letf2(a: f128, b: f128) -> crate::float::cmp::CmpResult {
        cmp(a, b).to_le_abi()
    }
    #[ppc_alias = __gekf2]
    pub extern "C" fn __getf2(a: f128, b: f128) -> crate::float::cmp::CmpResult {
        cmp(a, b).to_ge_abi()
    }
    #[ppc_alias = __unordkf2]
    pub extern "C" fn __unordtf2(a: f128, b: f128) -> crate::float::cmp::CmpResult {
        // Nonzero iff either operand is NaN.
        unord(a, b) as crate::float::cmp::CmpResult
    }
    #[ppc_alias = __eqkf2]
    pub extern "C" fn __eqtf2(a: f128, b: f128) -> crate::float::cmp::CmpResult {
        cmp(a, b).to_le_abi()
    }
    #[ppc_alias = __ltkf2]
    pub extern "C" fn __lttf2(a: f128, b: f128) -> crate::float::cmp::CmpResult {
        cmp(a, b).to_le_abi()
    }
    #[ppc_alias = __nekf2]
    pub extern "C" fn __netf2(a: f128, b: f128) -> crate::float::cmp::CmpResult {
        cmp(a, b).to_le_abi()
    }
    #[ppc_alias = __gtkf2]
    pub extern "C" fn __gttf2(a: f128, b: f128) -> crate::float::cmp::CmpResult {
        cmp(a, b).to_ge_abi()
    }
}
#[cfg(target_arch = "arm")]
intrinsics! {
    // ARM EABI comparison helpers. Unlike the three-way `__*f2` routines
    // above, these return a boolean-style `i32` (1 if the relation holds,
    // 0 otherwise), built by testing the three-way result.
    pub extern "aapcs" fn __aeabi_fcmple(a: f32, b: f32) -> i32 {
        (__lesf2(a, b) <= 0) as i32
    }
    pub extern "aapcs" fn __aeabi_fcmpge(a: f32, b: f32) -> i32 {
        (__gesf2(a, b) >= 0) as i32
    }
    pub extern "aapcs" fn __aeabi_fcmpeq(a: f32, b: f32) -> i32 {
        (__eqsf2(a, b) == 0) as i32
    }
    pub extern "aapcs" fn __aeabi_fcmplt(a: f32, b: f32) -> i32 {
        (__ltsf2(a, b) < 0) as i32
    }
    pub extern "aapcs" fn __aeabi_fcmpgt(a: f32, b: f32) -> i32 {
        (__gtsf2(a, b) > 0) as i32
    }
    pub extern "aapcs" fn __aeabi_dcmple(a: f64, b: f64) -> i32 {
        (__ledf2(a, b) <= 0) as i32
    }
    pub extern "aapcs" fn __aeabi_dcmpge(a: f64, b: f64) -> i32 {
        (__gedf2(a, b) >= 0) as i32
    }
    pub extern "aapcs" fn __aeabi_dcmpeq(a: f64, b: f64) -> i32 {
        (__eqdf2(a, b) == 0) as i32
    }
    pub extern "aapcs" fn __aeabi_dcmplt(a: f64, b: f64) -> i32 {
        (__ltdf2(a, b) < 0) as i32
    }
    pub extern "aapcs" fn __aeabi_dcmpgt(a: f64, b: f64) -> i32 {
        (__gtdf2(a, b) > 0) as i32
    }
}

View file

@ -0,0 +1,489 @@
use core::ops::Neg;
use super::Float;
use crate::int::{CastFrom, CastInto, Int, MinInt};
/// Conversions from integers to floats.
///
/// The algorithm is explained here: <https://blog.m-ou.se/floats/>. It roughly does the following:
/// - Calculate a base mantissa by shifting the integer into mantissa position. This gives us a
///   mantissa _with the implicit bit set_!
/// - Figure out if rounding needs to occur by classifying the bits that are to be truncated. Some
///   patterns are used to simplify this. Adjust the mantissa with the result if needed.
/// - Calculate the exponent based on the base-2 logarithm of `i` (leading zeros). Subtract one.
/// - Shift the exponent and add the mantissa to create the final representation. Subtracting one
///   from the exponent (above) accounts for the explicit bit being set in the mantissa.
///
/// # Terminology
///
/// - `i`: the original integer
/// - `i_m`: the integer, shifted fully left (no leading zeros)
/// - `n`: number of leading zeroes
/// - `e`: the resulting exponent. Usually 1 is subtracted to offset the mantissa implicit bit.
/// - `m_base`: the mantissa before adjusting for truncated bits. Implicit bit is usually set.
/// - `adj`: the bits that will be truncated, possibly compressed in some way.
/// - `m`: the resulting mantissa. Implicit bit is usually set.
mod int_to_float {
    use super::*;
    /// Calculate the exponent from the number of leading zeros.
    ///
    /// Usually 1 is subtracted from this function's result, so that a mantissa with the implicit
    /// bit set can be added back later.
    fn exp<I: Int, F: Float<Int: CastFrom<u32>>>(n: u32) -> F::Int {
        F::Int::cast_from(F::EXP_BIAS - 1 + I::BITS - n)
    }
    /// Adjust a mantissa with dropped bits to perform correct rounding.
    ///
    /// The dropped bits should be exactly the bits that get truncated (left-aligned), but they
    /// can be combined or compressed in some way that simplifies operations.
    fn m_adj<F: Float>(m_base: F::Int, dropped_bits: F::Int) -> F::Int {
        // Branchlessly extract a `1` if rounding up should happen, 0 otherwise
        // This accounts for rounding to even.
        let adj = (dropped_bits - ((dropped_bits >> (F::BITS - 1)) & !m_base)) >> (F::BITS - 1);
        // Add one when we need to round up. Break ties to even.
        m_base + adj
    }
    /// Shift the exponent to its position and add the mantissa.
    ///
    /// If the mantissa has the implicit bit set, the exponent should be one less than its actual
    /// value to cancel it out.
    fn repr<F: Float>(e: F::Int, m: F::Int) -> F::Int {
        // + rather than | so the mantissa can overflow into the exponent
        (e << F::SIG_BITS) + m
    }
    /// Shift distance from a left-aligned integer to a smaller float.
    fn shift_f_lt_i<I: Int, F: Float>() -> u32 {
        (I::BITS - F::BITS) + F::EXP_BITS
    }
    /// Shift distance from an integer with `n` leading zeros to a smaller float.
    fn shift_f_gt_i<I: Int, F: Float>(n: u32) -> u32 {
        F::SIG_BITS - I::BITS + 1 + n
    }
    /// Perform a signed operation as unsigned, then add the sign back.
    pub fn signed<I, F, Conv>(i: I, conv: Conv) -> F
    where
        F: Float,
        I: Int,
        F::Int: CastFrom<I>,
        Conv: Fn(I::UnsignedInt) -> F::Int,
    {
        // Broadcast the sign bit of `i` into the float's sign position.
        let sign_bit = F::Int::cast_from(i >> (I::BITS - 1)) << (F::BITS - 1);
        F::from_bits(conv(i.unsigned_abs()) | sign_bit)
    }
    pub fn u32_to_f32_bits(i: u32) -> u32 {
        if i == 0 {
            return 0;
        }
        let n = i.leading_zeros();
        // Mantissa with implicit bit set (significant bits)
        let m_base = (i << n) >> f32::EXP_BITS;
        // Bits that will be dropped (insignificant bits)
        let adj = (i << n) << (f32::SIG_BITS + 1);
        let m = m_adj::<f32>(m_base, adj);
        let e = exp::<u32, f32>(n) - 1;
        repr::<f32>(e, m)
    }
    pub fn u32_to_f64_bits(i: u32) -> u64 {
        if i == 0 {
            return 0;
        }
        let n = i.leading_zeros();
        // Mantissa with implicit bit set. No rounding needed: every u32 is
        // exactly representable in f64.
        let m = (i as u64) << shift_f_gt_i::<u32, f64>(n);
        let e = exp::<u32, f64>(n) - 1;
        repr::<f64>(e, m)
    }
    #[cfg(f128_enabled)]
    pub fn u32_to_f128_bits(i: u32) -> u128 {
        if i == 0 {
            return 0;
        }
        let n = i.leading_zeros();
        // Shift into mantissa position that is correct for the type, but shifted into the lower
        // 64 bits over so we can avoid 128-bit math.
        let m = (i as u64) << (shift_f_gt_i::<u32, f128>(n) - 64);
        let e = exp::<u32, f128>(n) as u64 - 1;
        // High 64 bits of f128 representation.
        let h = (e << (f128::SIG_BITS - 64)) + m;
        // Shift back to the high bits, the rest of the mantissa will always be 0.
        (h as u128) << 64
    }
    pub fn u64_to_f32_bits(i: u64) -> u32 {
        // `i == 0` has no early return here: `wrapping_shl` keeps the shift by
        // `n == 64` well-defined (`i_m` stays 0) and `e` is forced to 0 below.
        let n = i.leading_zeros();
        let i_m = i.wrapping_shl(n);
        // Mantissa with implicit bit set
        let m_base: u32 = (i_m >> shift_f_lt_i::<u64, f32>()) as u32;
        // The entire lower half of `i` will be truncated (masked portion), plus the
        // next `EXP_BITS` bits.
        let adj = ((i_m >> f32::EXP_BITS) | i_m & 0xFFFF) as u32;
        let m = m_adj::<f32>(m_base, adj);
        let e = if i == 0 { 0 } else { exp::<u64, f32>(n) - 1 };
        repr::<f32>(e, m)
    }
    pub fn u64_to_f64_bits(i: u64) -> u64 {
        if i == 0 {
            return 0;
        }
        let n = i.leading_zeros();
        // Mantissa with implicit bit set
        let m_base = (i << n) >> f64::EXP_BITS;
        // Bits that will be dropped (insignificant bits)
        let adj = (i << n) << (f64::SIG_BITS + 1);
        let m = m_adj::<f64>(m_base, adj);
        let e = exp::<u64, f64>(n) - 1;
        repr::<f64>(e, m)
    }
    #[cfg(f128_enabled)]
    pub fn u64_to_f128_bits(i: u64) -> u128 {
        if i == 0 {
            return 0;
        }
        let n = i.leading_zeros();
        // Mantissa with implicit bit set. No rounding needed: every u64 is
        // exactly representable in f128.
        let m = (i as u128) << shift_f_gt_i::<u64, f128>(n);
        let e = exp::<u64, f128>(n) - 1;
        repr::<f128>(e, m)
    }
    pub fn u128_to_f32_bits(i: u128) -> u32 {
        let n = i.leading_zeros();
        let i_m = i.wrapping_shl(n); // Mantissa, shifted so the first bit is nonzero
        let m_base: u32 = (i_m >> shift_f_lt_i::<u128, f32>()) as u32;
        // Within the upper `F::BITS`, everything except for the significand
        // gets truncated
        let d1: u32 = (i_m >> (u128::BITS - f32::BITS - f32::SIG_BITS - 1)).cast();
        // The entire rest of `i_m` gets truncated. Zero the upper `F::BITS` then just
        // check if it is nonzero.
        let d2: u32 = (i_m << f32::BITS >> f32::BITS != 0).into();
        let adj = d1 | d2;
        // Mantissa with implicit bit set
        let m = m_adj::<f32>(m_base, adj);
        let e = if i == 0 { 0 } else { exp::<u128, f32>(n) - 1 };
        repr::<f32>(e, m)
    }
    pub fn u128_to_f64_bits(i: u128) -> u64 {
        let n = i.leading_zeros();
        let i_m = i.wrapping_shl(n);
        // Mantissa with implicit bit set
        let m_base: u64 = (i_m >> shift_f_lt_i::<u128, f64>()) as u64;
        // The entire lower half of `i` will be truncated (masked portion), plus the
        // next `EXP_BITS` bits.
        let adj = ((i_m >> f64::EXP_BITS) | i_m & 0xFFFF_FFFF) as u64;
        let m = m_adj::<f64>(m_base, adj);
        let e = if i == 0 { 0 } else { exp::<u128, f64>(n) - 1 };
        repr::<f64>(e, m)
    }
    #[cfg(f128_enabled)]
    pub fn u128_to_f128_bits(i: u128) -> u128 {
        if i == 0 {
            return 0;
        }
        let n = i.leading_zeros();
        // Mantissa with implicit bit set
        let m_base = (i << n) >> f128::EXP_BITS;
        // Bits that will be dropped (insignificant bits)
        let adj = (i << n) << (f128::SIG_BITS + 1);
        let m = m_adj::<f128>(m_base, adj);
        let e = exp::<u128, f128>(n) - 1;
        repr::<f128>(e, m)
    }
}
// Conversions from unsigned integers to floats.
intrinsics! {
    // Name scheme: `__floatun <int> <float>` with `si`/`di`/`ti` for
    // u32/u64/u128 and `sf`/`df`/`tf` for f32/f64/f128. Each defers to the
    // bit-level helpers in `int_to_float` above.
    #[arm_aeabi_alias = __aeabi_ui2f]
    pub extern "C" fn __floatunsisf(i: u32) -> f32 {
        f32::from_bits(int_to_float::u32_to_f32_bits(i))
    }
    #[arm_aeabi_alias = __aeabi_ui2d]
    pub extern "C" fn __floatunsidf(i: u32) -> f64 {
        f64::from_bits(int_to_float::u32_to_f64_bits(i))
    }
    #[arm_aeabi_alias = __aeabi_ul2f]
    pub extern "C" fn __floatundisf(i: u64) -> f32 {
        f32::from_bits(int_to_float::u64_to_f32_bits(i))
    }
    #[arm_aeabi_alias = __aeabi_ul2d]
    pub extern "C" fn __floatundidf(i: u64) -> f64 {
        f64::from_bits(int_to_float::u64_to_f64_bits(i))
    }
    #[cfg_attr(target_os = "uefi", unadjusted_on_win64)]
    pub extern "C" fn __floatuntisf(i: u128) -> f32 {
        f32::from_bits(int_to_float::u128_to_f32_bits(i))
    }
    #[cfg_attr(target_os = "uefi", unadjusted_on_win64)]
    pub extern "C" fn __floatuntidf(i: u128) -> f64 {
        f64::from_bits(int_to_float::u128_to_f64_bits(i))
    }
    #[ppc_alias = __floatunsikf]
    #[cfg(f128_enabled)]
    pub extern "C" fn __floatunsitf(i: u32) -> f128 {
        f128::from_bits(int_to_float::u32_to_f128_bits(i))
    }
    #[ppc_alias = __floatundikf]
    #[cfg(f128_enabled)]
    pub extern "C" fn __floatunditf(i: u64) -> f128 {
        f128::from_bits(int_to_float::u64_to_f128_bits(i))
    }
    #[ppc_alias = __floatuntikf]
    #[cfg(f128_enabled)]
    pub extern "C" fn __floatuntitf(i: u128) -> f128 {
        f128::from_bits(int_to_float::u128_to_f128_bits(i))
    }
}
// Conversions from signed integers to floats.
intrinsics! {
    // Same name scheme as the unsigned versions (minus the `un`). Each wraps
    // the corresponding unsigned-to-bits helper via `int_to_float::signed`,
    // which converts the absolute value and then ORs the sign bit back in.
    #[arm_aeabi_alias = __aeabi_i2f]
    pub extern "C" fn __floatsisf(i: i32) -> f32 {
        int_to_float::signed(i, int_to_float::u32_to_f32_bits)
    }
    #[arm_aeabi_alias = __aeabi_i2d]
    pub extern "C" fn __floatsidf(i: i32) -> f64 {
        int_to_float::signed(i, int_to_float::u32_to_f64_bits)
    }
    #[arm_aeabi_alias = __aeabi_l2f]
    pub extern "C" fn __floatdisf(i: i64) -> f32 {
        int_to_float::signed(i, int_to_float::u64_to_f32_bits)
    }
    #[arm_aeabi_alias = __aeabi_l2d]
    pub extern "C" fn __floatdidf(i: i64) -> f64 {
        int_to_float::signed(i, int_to_float::u64_to_f64_bits)
    }
    #[cfg_attr(target_os = "uefi", unadjusted_on_win64)]
    pub extern "C" fn __floattisf(i: i128) -> f32 {
        int_to_float::signed(i, int_to_float::u128_to_f32_bits)
    }
    #[cfg_attr(target_os = "uefi", unadjusted_on_win64)]
    pub extern "C" fn __floattidf(i: i128) -> f64 {
        int_to_float::signed(i, int_to_float::u128_to_f64_bits)
    }
    #[ppc_alias = __floatsikf]
    #[cfg(f128_enabled)]
    pub extern "C" fn __floatsitf(i: i32) -> f128 {
        int_to_float::signed(i, int_to_float::u32_to_f128_bits)
    }
    #[ppc_alias = __floatdikf]
    #[cfg(f128_enabled)]
    pub extern "C" fn __floatditf(i: i64) -> f128 {
        int_to_float::signed(i, int_to_float::u64_to_f128_bits)
    }
    #[ppc_alias = __floattikf]
    #[cfg(f128_enabled)]
    pub extern "C" fn __floattitf(i: i128) -> f128 {
        int_to_float::signed(i, int_to_float::u128_to_f128_bits)
    }
}
/// Generic float to unsigned int conversions.
///
/// Truncates toward zero. Via `float_to_int_inner`: values with the sign bit
/// set (negative inputs) and NaN produce `U::ZERO`; finite values too large
/// for `U`, and infinity, saturate to `U::MAX`.
fn float_to_unsigned_int<F, U>(f: F) -> U
where
    F: Float,
    U: Int<UnsignedInt = U>,
    F::Int: CastInto<U>,
    F::Int: CastFrom<u32>,
    F::Int: CastInto<U::UnsignedInt>,
    u32: CastFrom<F::Int>,
{
    // Unsigned: no sign handling needed, the identity map and `U::MAX` suffice.
    float_to_int_inner::<F, U, _, _>(f.to_bits(), |i: U| i, || U::MAX)
}
/// Generic float to signed int conversions.
///
/// Truncates toward zero. The sign bit is stripped before the shared inner
/// conversion and reapplied afterwards; out-of-range magnitudes (including
/// infinities) saturate to `I::MIN` or `I::MAX` depending on sign, and NaN
/// produces `I::ZERO`.
fn float_to_signed_int<F, I>(f: F) -> I
where
    F: Float,
    I: Int + Neg<Output = I>,
    I::UnsignedInt: Int,
    F::Int: CastInto<I::UnsignedInt>,
    F::Int: CastFrom<u32>,
    u32: CastFrom<F::Int>,
{
    float_to_int_inner::<F, I, _, _>(
        // Pass the magnitude only; the closures below restore the sign.
        f.to_bits() & !F::SIGN_MASK,
        |i: I| if f.is_sign_negative() { -i } else { i },
        || if f.is_sign_negative() { I::MIN } else { I::MAX },
    )
}
/// Float to int conversions, generic for both signed and unsigned.
///
/// Parameters:
/// - `fbits`: `abs(f)` bitcasted to an integer.
/// - `map_inbounds`: transformation applied to in-range results (e.g. restoring the sign).
/// - `out_of_bounds`: value returned when the magnitude is too large for `I`.
fn float_to_int_inner<F, I, MapFn, OobFn>(
    fbits: F::Int,
    map_inbounds: MapFn,
    out_of_bounds: OobFn,
) -> I
where
    F: Float,
    I: Int,
    MapFn: FnOnce(I) -> I,
    OobFn: FnOnce() -> I,
    I::UnsignedInt: Int,
    F::Int: CastInto<I::UnsignedInt>,
    F::Int: CastFrom<u32>,
    u32: CastFrom<F::Int>,
{
    // Smallest biased exponent whose value no longer fits in `I`.
    let int_max_exp = F::EXP_BIAS + I::MAX.ilog2() + 1;
    // Biased exponent at which the left-aligned significand needs no right
    // shift; the shift below is this minus the input's biased exponent.
    let no_shift_exp = F::EXP_BIAS + I::UnsignedInt::BITS - 1;

    if fbits < F::ONE.to_bits() {
        // Magnitude below 1.0 truncates to zero.
        return I::ZERO;
    }
    if fbits >= F::Int::cast_from(int_max_exp) << F::SIG_BITS {
        if fbits <= F::EXP_MASK {
            // Finite but too large for `I`, or infinity.
            return out_of_bounds();
        }
        // NaN.
        return I::ZERO;
    }

    // In range: `>= 1` and representable in `I`. Left-align the stored
    // significand within `I::UnsignedInt`, widening or narrowing as needed.
    let m_base = if I::UnsignedInt::BITS >= F::Int::BITS {
        I::UnsignedInt::cast_from(fbits) << (I::BITS - F::SIG_BITS - 1)
    } else {
        I::UnsignedInt::cast_from(fbits >> (F::SIG_BITS - I::BITS + 1))
    };
    // Set the implicit 1-bit.
    let m: I::UnsignedInt = (I::UnsignedInt::ONE << (I::BITS - 1)) | m_base;
    // Right-shift according to the exponent to place the integer value.
    let s: u32 = no_shift_exp - u32::cast_from(fbits >> F::SIG_BITS);
    map_inbounds(I::from_unsigned(m >> s))
}
// Conversions from floats to unsigned integers.
intrinsics! {
    // Truncating (round-toward-zero) conversions; see `float_to_unsigned_int`
    // for the behavior on negative, out-of-range, and NaN inputs.
    #[arm_aeabi_alias = __aeabi_f2uiz]
    pub extern "C" fn __fixunssfsi(f: f32) -> u32 {
        float_to_unsigned_int(f)
    }
    #[arm_aeabi_alias = __aeabi_f2ulz]
    pub extern "C" fn __fixunssfdi(f: f32) -> u64 {
        float_to_unsigned_int(f)
    }
    pub extern "C" fn __fixunssfti(f: f32) -> u128 {
        float_to_unsigned_int(f)
    }
    #[arm_aeabi_alias = __aeabi_d2uiz]
    pub extern "C" fn __fixunsdfsi(f: f64) -> u32 {
        float_to_unsigned_int(f)
    }
    #[arm_aeabi_alias = __aeabi_d2ulz]
    pub extern "C" fn __fixunsdfdi(f: f64) -> u64 {
        float_to_unsigned_int(f)
    }
    pub extern "C" fn __fixunsdfti(f: f64) -> u128 {
        float_to_unsigned_int(f)
    }
    #[ppc_alias = __fixunskfsi]
    #[cfg(f128_enabled)]
    pub extern "C" fn __fixunstfsi(f: f128) -> u32 {
        float_to_unsigned_int(f)
    }
    #[ppc_alias = __fixunskfdi]
    #[cfg(f128_enabled)]
    pub extern "C" fn __fixunstfdi(f: f128) -> u64 {
        float_to_unsigned_int(f)
    }
    #[ppc_alias = __fixunskfti]
    #[cfg(f128_enabled)]
    pub extern "C" fn __fixunstfti(f: f128) -> u128 {
        float_to_unsigned_int(f)
    }
}
// Conversions from floats to signed integers.
intrinsics! {
    // Truncating (round-toward-zero) conversions; see `float_to_signed_int`
    // for the saturating behavior on out-of-range and NaN inputs.
    #[arm_aeabi_alias = __aeabi_f2iz]
    pub extern "C" fn __fixsfsi(f: f32) -> i32 {
        float_to_signed_int(f)
    }
    #[arm_aeabi_alias = __aeabi_f2lz]
    pub extern "C" fn __fixsfdi(f: f32) -> i64 {
        float_to_signed_int(f)
    }
    pub extern "C" fn __fixsfti(f: f32) -> i128 {
        float_to_signed_int(f)
    }
    #[arm_aeabi_alias = __aeabi_d2iz]
    pub extern "C" fn __fixdfsi(f: f64) -> i32 {
        float_to_signed_int(f)
    }
    #[arm_aeabi_alias = __aeabi_d2lz]
    pub extern "C" fn __fixdfdi(f: f64) -> i64 {
        float_to_signed_int(f)
    }
    pub extern "C" fn __fixdfti(f: f64) -> i128 {
        float_to_signed_int(f)
    }
    #[ppc_alias = __fixkfsi]
    #[cfg(f128_enabled)]
    pub extern "C" fn __fixtfsi(f: f128) -> i32 {
        float_to_signed_int(f)
    }
    #[ppc_alias = __fixkfdi]
    #[cfg(f128_enabled)]
    pub extern "C" fn __fixtfdi(f: f128) -> i64 {
        float_to_signed_int(f)
    }
    #[ppc_alias = __fixkfti]
    #[cfg(f128_enabled)]
    pub extern "C" fn __fixtfti(f: f128) -> i128 {
        float_to_signed_int(f)
    }
}

View file

@ -0,0 +1,635 @@
//! Floating point division routines.
//!
//! This module documentation gives an overview of the method used. More documentation is inline.
//!
//! # Relevant notation
//!
//! - `m_a`: the mantissa of `a`, in base 2
//! - `p_a`: the exponent of `a`, in base 2. I.e. `a = m_a * 2^p_a`
//! - `uqN` (e.g. `uq1`): this refers to Q notation for fixed-point numbers. UQ1.31 is an unsigned
//! fixed-point number with 1 integral bit, and 31 decimal bits. A `uqN` variable of type `uM`
//! will have N bits of integer and M-N bits of fraction.
//! - `hw`: half width, i.e. for `f64` this will be a `u32`.
//! - `x` is the best estimate of `1/m_b`
//!
//! # Method Overview
//!
//! Division routines must solve for `a / b`, which is `res = m_a*2^p_a / m_b*2^p_b`. The basic
//! process is as follows:
//!
//! - Rearrange the exponent and significand to simplify the operations:
//! `res = (m_a / m_b) * 2^{p_a - p_b}`.
//! - Check for early exits (infinity, zero, etc).
//! - If `a` or `b` are subnormal, normalize by shifting the mantissa and adjusting the exponent.
//! - Set the implicit bit so math is correct.
//! - Shift mantissa significant digits (with implicit bit) fully left such that fixed-point UQ1
//! or UQ0 numbers can be used for mantissa math. These will have greater precision than the
//! actual mantissa, which is important for correct rounding.
//! - Calculate the reciprocal of `m_b`, `x`.
//! - Use the reciprocal to multiply rather than divide: `res = m_a * x_b * 2^{p_a - p_b}`.
//! - Reapply rounding.
//!
//! # Reciprocal calculation
//!
//! Calculating the reciprocal is the most complicated part of this process. It uses the
//! [Newton-Raphson method], which picks an initial estimation (of the reciprocal) and performs
//! a number of iterations to increase its precision.
//!
//! In general, Newton's method takes the following form:
//!
//! ```text
//! `x_n` is a guess or the result of a previous iteration. Increasing `n` converges to the
//! desired result.
//!
//! The result approaches a zero of `f(x)` by applying a correction to the previous guess.
//!
//! x_{n+1} = x_n - f(x_n) / f'(x_n)
//! ```
//!
//! Applying this to find the reciprocal:
//!
//! ```text
//! 1 / x = b
//!
//! Rearrange so we can solve by finding a zero
//! 0 = (1 / x) - b = f(x)
//!
//! f'(x) = -x^{-2}
//!
//! x_{n+1} = 2*x_n - b*x_n^2
//! ```
//!
//! This is a process that can be repeated to calculate the reciprocal with enough precision to
//! achieve a correctly rounded result for the overall division operation. The maximum required
//! number of iterations is known since precision doubles with each iteration.
//!
//! # Half-width operations
//!
//! Calculating the reciprocal requires widening multiplication and performing arithmetic on the
//! results, meaning that emulated integer arithmetic on `u128` (for `f64`) and `u256` (for `f128`)
//! gets used instead of native math.
//!
//! To make this more efficient, all but the final operation can be computed using half-width
//! integers. For example, rather than computing four iterations using 128-bit integers for `f64`,
//! we can instead perform three iterations using native 64-bit integers and only one final
//! iteration using the full 128 bits.
//!
//! This works because of precision doubling. Some leeway is allowed here because the fixed-point
//! number has more bits than the final mantissa will.
//!
//! [Newton-Raphson method]: https://en.wikipedia.org/wiki/Newton%27s_method
use core::mem::size_of;
use core::ops;
use super::HalfRep;
use crate::float::Float;
use crate::int::{CastFrom, CastInto, DInt, HInt, Int, MinInt};
fn div<F: Float>(a: F, b: F) -> F
where
F::Int: CastInto<i32>,
F::Int: From<HalfRep<F>>,
F::Int: From<u8>,
F::Int: HInt + DInt,
<F::Int as HInt>::D: ops::Shr<u32, Output = <F::Int as HInt>::D>,
F::Int: From<u32>,
u16: CastInto<F::Int>,
i32: CastInto<F::Int>,
u32: CastInto<F::Int>,
u128: CastInto<HalfRep<F>>,
{
let one = F::Int::ONE;
let zero = F::Int::ZERO;
let one_hw = HalfRep::<F>::ONE;
let zero_hw = HalfRep::<F>::ZERO;
let hw = F::BITS / 2;
let lo_mask = F::Int::MAX >> hw;
let significand_bits = F::SIG_BITS;
// Saturated exponent, representing infinity
let exponent_sat: F::Int = F::EXP_SAT.cast();
let exponent_bias = F::EXP_BIAS;
let implicit_bit = F::IMPLICIT_BIT;
let significand_mask = F::SIG_MASK;
let sign_bit = F::SIGN_MASK;
let abs_mask = sign_bit - one;
let exponent_mask = F::EXP_MASK;
let inf_rep = exponent_mask;
let quiet_bit = implicit_bit >> 1;
let qnan_rep = exponent_mask | quiet_bit;
let (mut half_iterations, full_iterations) = get_iterations::<F>();
let recip_precision = reciprocal_precision::<F>();
if F::BITS == 128 {
// FIXME(tgross35): f128 seems to require one more half iteration than expected
half_iterations += 1;
}
let a_rep = a.to_bits();
let b_rep = b.to_bits();
// Exponent numeric representationm not accounting for bias
let a_exponent = (a_rep >> significand_bits) & exponent_sat;
let b_exponent = (b_rep >> significand_bits) & exponent_sat;
let quotient_sign = (a_rep ^ b_rep) & sign_bit;
let mut a_significand = a_rep & significand_mask;
let mut b_significand = b_rep & significand_mask;
// The exponent of our final result in its encoded form
let mut res_exponent: i32 =
i32::cast_from(a_exponent) - i32::cast_from(b_exponent) + (exponent_bias as i32);
// Detect if a or b is zero, denormal, infinity, or NaN.
if a_exponent.wrapping_sub(one) >= (exponent_sat - one)
|| b_exponent.wrapping_sub(one) >= (exponent_sat - one)
{
let a_abs = a_rep & abs_mask;
let b_abs = b_rep & abs_mask;
// NaN / anything = qNaN
if a_abs > inf_rep {
return F::from_bits(a_rep | quiet_bit);
}
// anything / NaN = qNaN
if b_abs > inf_rep {
return F::from_bits(b_rep | quiet_bit);
}
if a_abs == inf_rep {
if b_abs == inf_rep {
// infinity / infinity = NaN
return F::from_bits(qnan_rep);
} else {
// infinity / anything else = +/- infinity
return F::from_bits(a_abs | quotient_sign);
}
}
// anything else / infinity = +/- 0
if b_abs == inf_rep {
return F::from_bits(quotient_sign);
}
if a_abs == zero {
if b_abs == zero {
// zero / zero = NaN
return F::from_bits(qnan_rep);
} else {
// zero / anything else = +/- zero
return F::from_bits(quotient_sign);
}
}
// anything else / zero = +/- infinity
if b_abs == zero {
return F::from_bits(inf_rep | quotient_sign);
}
// a is denormal. Renormalize it and set the scale to include the necessary exponent
// adjustment.
if a_abs < implicit_bit {
let (exponent, significand) = F::normalize(a_significand);
res_exponent += exponent;
a_significand = significand;
}
// b is denormal. Renormalize it and set the scale to include the necessary exponent
// adjustment.
if b_abs < implicit_bit {
let (exponent, significand) = F::normalize(b_significand);
res_exponent -= exponent;
b_significand = significand;
}
}
// Set the implicit significand bit. If we fell through from the
// denormal path it was already set by normalize( ), but setting it twice
// won't hurt anything.
a_significand |= implicit_bit;
b_significand |= implicit_bit;
// Transform to a fixed-point representation by shifting the significand to the high bits. We
// know this is in the range [1.0, 2.0] since the implicit bit is set to 1 above.
let b_uq1 = b_significand << (F::BITS - significand_bits - 1);
// Align the significand of b as a UQ1.(n-1) fixed-point number in the range
// [1.0, 2.0) and get a UQ0.n approximate reciprocal using a small minimax
// polynomial approximation: x0 = 3/4 + 1/sqrt(2) - b/2.
// The max error for this approximation is achieved at endpoints, so
// abs(x0(b) - 1/b) <= abs(x0(1) - 1/1) = 3/4 - 1/sqrt(2) = 0.04289...,
// which is about 4.5 bits.
// The initial approximation is between x0(1.0) = 0.9571... and x0(2.0) = 0.4571...
//
// Then, refine the reciprocal estimate using a quadratically converging
// Newton-Raphson iteration:
// x_{n+1} = x_n * (2 - x_n * b)
//
// Let b be the original divisor considered "in infinite precision" and
// obtained from IEEE754 representation of function argument (with the
// implicit bit set). Corresponds to rep_t-sized b_UQ1 represented in
// UQ1.(W-1).
//
// Let b_hw be an infinitely precise number obtained from the highest (HW-1)
// bits of divisor significand (with the implicit bit set). Corresponds to
// half_rep_t-sized b_UQ1_hw represented in UQ1.(HW-1) that is a **truncated**
// version of b_UQ1.
//
// Let e_n := x_n - 1/b_hw
// E_n := x_n - 1/b
// abs(E_n) <= abs(e_n) + (1/b_hw - 1/b)
// = abs(e_n) + (b - b_hw) / (b*b_hw)
// <= abs(e_n) + 2 * 2^-HW
//
// rep_t-sized iterations may be slower than the corresponding half-width
// variant depending on the handware and whether single/double/quad precision
// is selected.
//
// NB: Using half-width iterations increases computation errors due to
// rounding, so error estimations have to be computed taking the selected
// mode into account!
let mut x_uq0 = if half_iterations > 0 {
// Starting with (n-1) half-width iterations
let b_uq1_hw: HalfRep<F> = b_uq1.hi();
// C is (3/4 + 1/sqrt(2)) - 1 truncated to W0 fractional bits as UQ0.HW
// with W0 being either 16 or 32 and W0 <= HW.
// That is, C is the aforementioned 3/4 + 1/sqrt(2) constant (from which
// b/2 is subtracted to obtain x0) wrapped to [0, 1) range.
let c_hw = c_hw::<F>();
// Check that the top bit is set, i.e. value is within `[1, 2)`.
debug_assert!(b_uq1_hw & (one_hw << (HalfRep::<F>::BITS - 1)) > zero_hw);
// b >= 1, thus an upper bound for 3/4 + 1/sqrt(2) - b/2 is about 0.9572,
// so x0 fits to UQ0.HW without wrapping.
let mut x_uq0_hw: HalfRep<F> =
c_hw.wrapping_sub(b_uq1_hw /* exact b_hw/2 as UQ0.HW */);
// An e_0 error is comprised of errors due to
// * x0 being an inherently imprecise first approximation of 1/b_hw
// * C_hw being some (irrational) number **truncated** to W0 bits
// Please note that e_0 is calculated against the infinitely precise
// reciprocal of b_hw (that is, **truncated** version of b).
//
// e_0 <= 3/4 - 1/sqrt(2) + 2^-W0
//
// By construction, 1 <= b < 2
// f(x) = x * (2 - b*x) = 2*x - b*x^2
// f'(x) = 2 * (1 - b*x)
//
// On the [0, 1] interval, f(0) = 0,
// then it increses until f(1/b) = 1 / b, maximum on (0, 1),
// then it decreses to f(1) = 2 - b
//
// Let g(x) = x - f(x) = b*x^2 - x.
// On (0, 1/b), g(x) < 0 <=> f(x) > x
// On (1/b, 1], g(x) > 0 <=> f(x) < x
//
// For half-width iterations, b_hw is used instead of b.
for _ in 0..half_iterations {
// corr_UQ1_hw can be **larger** than 2 - b_hw*x by at most 1*Ulp
// of corr_UQ1_hw.
// "0.0 - (...)" is equivalent to "2.0 - (...)" in UQ1.(HW-1).
// On the other hand, corr_UQ1_hw should not overflow from 2.0 to 0.0 provided
// no overflow occurred earlier: ((rep_t)x_UQ0_hw * b_UQ1_hw >> HW) is
// expected to be strictly positive because b_UQ1_hw has its highest bit set
// and x_UQ0_hw should be rather large (it converges to 1/2 < 1/b_hw <= 1).
//
// Now, we should multiply UQ0.HW and UQ1.(HW-1) numbers, naturally
// obtaining an UQ1.(HW-1) number and proving its highest bit could be
// considered to be 0 to be able to represent it in UQ0.HW.
// From the above analysis of f(x), if corr_UQ1_hw would be represented
// without any intermediate loss of precision (that is, in twice_rep_t)
// x_UQ0_hw could be at most [1.]000... if b_hw is exactly 1.0 and strictly
// less otherwise. On the other hand, to obtain [1.]000..., one have to pass
// 1/b_hw == 1.0 to f(x), so this cannot occur at all without overflow (due
// to 1.0 being not representable as UQ0.HW).
// The fact corr_UQ1_hw was virtually round up (due to result of
// multiplication being **first** truncated, then negated - to improve
// error estimations) can increase x_UQ0_hw by up to 2*Ulp of x_UQ0_hw.
//
// Now, either no overflow occurred or x_UQ0_hw is 0 or 1 in its half_rep_t
// representation. In the latter case, x_UQ0_hw will be either 0 or 1 after
// any number of iterations, so just subtract 2 from the reciprocal
// approximation after last iteration.
//
// In infinite precision, with 0 <= eps1, eps2 <= U = 2^-HW:
// corr_UQ1_hw = 2 - (1/b_hw + e_n) * b_hw + 2*eps1
// = 1 - e_n * b_hw + 2*eps1
// x_UQ0_hw = (1/b_hw + e_n) * (1 - e_n*b_hw + 2*eps1) - eps2
// = 1/b_hw - e_n + 2*eps1/b_hw + e_n - e_n^2*b_hw + 2*e_n*eps1 - eps2
// = 1/b_hw + 2*eps1/b_hw - e_n^2*b_hw + 2*e_n*eps1 - eps2
// e_{n+1} = -e_n^2*b_hw + 2*eps1/b_hw + 2*e_n*eps1 - eps2
// = 2*e_n*eps1 - (e_n^2*b_hw + eps2) + 2*eps1/b_hw
// \------ >0 -------/ \-- >0 ---/
// abs(e_{n+1}) <= 2*abs(e_n)*U + max(2*e_n^2 + U, 2 * U)
x_uq0_hw = next_guess(x_uq0_hw, b_uq1_hw);
}
// For initial half-width iterations, U = 2^-HW
// Let abs(e_n) <= u_n * U,
// then abs(e_{n+1}) <= 2 * u_n * U^2 + max(2 * u_n^2 * U^2 + U, 2 * U)
// u_{n+1} <= 2 * u_n * U + max(2 * u_n^2 * U + 1, 2)
//
// Account for possible overflow (see above). For an overflow to occur for the
// first time, for "ideal" corr_UQ1_hw (that is, without intermediate
// truncation), the result of x_UQ0_hw * corr_UQ1_hw should be either maximum
// value representable in UQ0.HW or less by 1. This means that 1/b_hw have to
// be not below that value (see g(x) above), so it is safe to decrement just
// once after the final iteration. On the other hand, an effective value of
// divisor changes after this point (from b_hw to b), so adjust here.
x_uq0_hw = x_uq0_hw.wrapping_sub(one_hw);
// Error estimations for full-precision iterations are calculated just
// as above, but with U := 2^-W and taking extra decrementing into account.
// We need at least one such iteration.
//
// Simulating operations on a twice_rep_t to perform a single final full-width
// iteration. Using ad-hoc multiplication implementations to take advantage
// of particular structure of operands.
let blo: F::Int = b_uq1 & lo_mask;
// x_UQ0 = x_UQ0_hw * 2^HW - 1
// x_UQ0 * b_UQ1 = (x_UQ0_hw * 2^HW) * (b_UQ1_hw * 2^HW + blo) - b_UQ1
//
// <--- higher half ---><--- lower half --->
// [x_UQ0_hw * b_UQ1_hw]
// + [ x_UQ0_hw * blo ]
// - [ b_UQ1 ]
// = [ result ][.... discarded ...]
let corr_uq1: F::Int = (F::Int::from(x_uq0_hw) * F::Int::from(b_uq1_hw)
+ ((F::Int::from(x_uq0_hw) * blo) >> hw))
.wrapping_sub(one)
.wrapping_neg(); // account for *possible* carry
let lo_corr: F::Int = corr_uq1 & lo_mask;
let hi_corr: F::Int = corr_uq1 >> hw;
// x_UQ0 * corr_UQ1 = (x_UQ0_hw * 2^HW) * (hi_corr * 2^HW + lo_corr) - corr_UQ1
let mut x_uq0: F::Int = ((F::Int::from(x_uq0_hw) * hi_corr) << 1)
.wrapping_add((F::Int::from(x_uq0_hw) * lo_corr) >> (hw - 1))
// 1 to account for the highest bit of corr_UQ1 can be 1
// 1 to account for possible carry
// Just like the case of half-width iterations but with possibility
// of overflowing by one extra Ulp of x_UQ0.
.wrapping_sub(F::Int::from(2u8));
x_uq0 -= one;
// ... and then traditional fixup by 2 should work
// On error estimation:
// abs(E_{N-1}) <= (u_{N-1} + 2 /* due to conversion e_n -> E_n */) * 2^-HW
// + (2^-HW + 2^-W))
// abs(E_{N-1}) <= (u_{N-1} + 3.01) * 2^-HW
//
// Then like for the half-width iterations:
// With 0 <= eps1, eps2 < 2^-W
// E_N = 4 * E_{N-1} * eps1 - (E_{N-1}^2 * b + 4 * eps2) + 4 * eps1 / b
// abs(E_N) <= 2^-W * [ 4 * abs(E_{N-1}) + max(2 * abs(E_{N-1})^2 * 2^W + 4, 8)) ]
// abs(E_N) <= 2^-W * [ 4 * (u_{N-1} + 3.01) * 2^-HW + max(4 + 2 * (u_{N-1} + 3.01)^2, 8) ]
x_uq0
} else {
// C is (3/4 + 1/sqrt(2)) - 1 truncated to 64 fractional bits as UQ0.n
let c: F::Int = F::Int::from(0x7504F333u32) << (F::BITS - 32);
let mut x_uq0: F::Int = c.wrapping_sub(b_uq1);
// E_0 <= 3/4 - 1/sqrt(2) + 2 * 2^-64
// x_uq0
for _ in 0..full_iterations {
x_uq0 = next_guess(x_uq0, b_uq1);
}
x_uq0
};
// Finally, account for possible overflow, as explained above.
x_uq0 = x_uq0.wrapping_sub(2.cast());
// Suppose 1/b - P * 2^-W < x < 1/b + P * 2^-W
x_uq0 -= recip_precision.cast();
// Now 1/b - (2*P) * 2^-W < x < 1/b
// FIXME Is x_UQ0 still >= 0.5?
let mut quotient_uq1: F::Int = x_uq0.widen_mul(a_significand << 1).hi();
// Now, a/b - 4*P * 2^-W < q < a/b for q=<quotient_UQ1:dummy> in UQ1.(SB+1+W).
// quotient_UQ1 is in [0.5, 2.0) as UQ1.(SB+1),
// adjust it to be in [1.0, 2.0) as UQ1.SB.
let mut residual_lo = if quotient_uq1 < (implicit_bit << 1) {
// Highest bit is 0, so just reinterpret quotient_UQ1 as UQ1.SB,
// effectively doubling its value as well as its error estimation.
let residual_lo = (a_significand << (significand_bits + 1))
.wrapping_sub(quotient_uq1.wrapping_mul(b_significand));
res_exponent -= 1;
a_significand <<= 1;
residual_lo
} else {
// Highest bit is 1 (the UQ1.(SB+1) value is in [1, 2)), convert it
// to UQ1.SB by right shifting by 1. Least significant bit is omitted.
quotient_uq1 >>= 1;
(a_significand << significand_bits).wrapping_sub(quotient_uq1.wrapping_mul(b_significand))
};
// drop mutability
let quotient = quotient_uq1;
// NB: residualLo is calculated above for the normal result case.
// It is re-computed on denormal path that is expected to be not so
// performance-sensitive.
//
// Now, q cannot be greater than a/b and can differ by at most 8*P * 2^-W + 2^-SB
// Each NextAfter() increments the floating point value by at least 2^-SB
// (more, if exponent was incremented).
// Different cases (<---> is of 2^-SB length, * = a/b that is shown as a midpoint):
// q
// | | * | | | | |
// <---> 2^t
// | | | | | * | |
// q
// To require at most one NextAfter(), an error should be less than 1.5 * 2^-SB.
// (8*P) * 2^-W + 2^-SB < 1.5 * 2^-SB
// (8*P) * 2^-W < 0.5 * 2^-SB
// P < 2^(W-4-SB)
// Generally, for at most R NextAfter() to be enough,
// P < (2*R - 1) * 2^(W-4-SB)
// For f32 (0+3): 10 < 32 (OK)
// For f32 (2+1): 32 < 74 < 32 * 3, so two NextAfter() are required
// For f64: 220 < 256 (OK)
// For f128: 4096 * 3 < 13922 < 4096 * 5 (three NextAfter() are required)
//
// If we have overflowed the exponent, return infinity
if res_exponent >= i32::cast_from(exponent_sat) {
return F::from_bits(inf_rep | quotient_sign);
}
// Now, quotient <= the correctly-rounded result
// and may need taking NextAfter() up to 3 times (see error estimates above)
// r = a - b * q
let mut abs_result = if res_exponent > 0 {
let mut ret = quotient & significand_mask;
ret |= F::Int::from(res_exponent as u32) << significand_bits;
residual_lo <<= 1;
ret
} else {
if ((significand_bits as i32) + res_exponent) < 0 {
return F::from_bits(quotient_sign);
}
let ret = quotient.wrapping_shr(u32::cast_from(res_exponent.wrapping_neg()) + 1);
residual_lo = a_significand
.wrapping_shl(significand_bits.wrapping_add(CastInto::<u32>::cast(res_exponent)))
.wrapping_sub(ret.wrapping_mul(b_significand) << 1);
ret
};
residual_lo += abs_result & one; // tie to even
// conditionally turns the below LT comparison into LTE
abs_result += u8::from(residual_lo > b_significand).into();
if F::BITS == 128 || (F::BITS == 32 && half_iterations > 0) {
// Do not round Infinity to NaN
abs_result +=
u8::from(abs_result < inf_rep && residual_lo > (2 + 1).cast() * b_significand).into();
}
if F::BITS == 128 {
abs_result +=
u8::from(abs_result < inf_rep && residual_lo > (4 + 1).cast() * b_significand).into();
}
F::from_bits(abs_result | quotient_sign)
}
/// Calculate the number of iterations required for a float type's precision.
///
/// Returns `(h, f)`: `h` iterations performed with integers at half the float's
/// bit width, followed by `f` iterations at the full width. This is further
/// explained in the module documentation.
///
/// # Requirements
///
/// The initial estimate should have at least 8 bits of precision. If this is not true, results
/// will be inaccurate.
const fn get_iterations<F: Float>() -> (usize, usize) {
    // Each Newton-Raphson step doubles the precision; starting from 8 bits,
    // `log2(BITS) - 2` steps are enough to cover the full width of the type.
    let steps_needed = F::BITS.ilog2() as usize - 2;

    // Widening multiplication is cheap when the double-width product still
    // fits in a machine word.
    let widening_is_cheap = 2 * size_of::<F>() <= size_of::<*const ()>();

    if widening_is_cheap {
        // No reason to use half-sized iterations; run everything at full width.
        (0, steps_needed)
    } else {
        // Otherwise keep all but the final step at half width.
        (steps_needed - 1, 1)
    }
}
/// `u_n` for different precisions (with N-1 half-width iterations).
///
/// W0 is the precision of C
///   u_0 = (3/4 - 1/sqrt(2) + 2^-W0) * 2^HW
///
/// Estimated with bc:
///
/// ```text
/// define half1(un) { return 2.0 * (un + un^2) / 2.0^hw + 1.0; }
/// define half2(un) { return 2.0 * un / 2.0^hw + 2.0; }
/// define full1(un) { return 4.0 * (un + 3.01) / 2.0^hw + 2.0 * (un + 3.01)^2 + 4.0; }
/// define full2(un) { return 4.0 * (un + 3.01) / 2.0^hw + 8.0; }
///
///             | f32 (0 + 3) | f32 (2 + 1) | f64 (3 + 1)  | f128 (4 + 1)
/// u_0         | < 184224974 | < 2812.1    | < 184224974  | < 791240234244348797
/// u_1         | < 15804007  | < 242.7     | < 15804007   | < 67877681371350440
/// u_2         | < 116308    | < 2.81      | < 116308     | < 499533100252317
/// u_3         | < 7.31      |             | < 7.31       | < 27054456580
/// u_4         |             |             |              | < 80.4
/// Final (U_N) | same as u_3 | < 72        | < 218        | < 13920
/// ```
///
/// Add 2 to `U_N` due to final decrement.
const fn reciprocal_precision<F: Float>() -> u16 {
    let (half_iterations, full_iterations) = get_iterations::<F>();

    if full_iterations < 1 {
        panic!("Must have at least one full iteration");
    }

    // FIXME(tgross35): calculate this programmatically
    match (F::BITS, half_iterations, full_iterations) {
        (32, 2, 1) => 74,
        (32, 0, 3) => 10,
        (64, 3, 1) => 220,
        (128, 4, 1) => 13922,
        _ => panic!("Invalid number of iterations"),
    }
}
/// The value of `C` adjusted to half width.
///
/// C is (3/4 + 1/sqrt(2)) - 1 truncated to W0 fractional bits as UQ0.HW with W0 being either
/// 16 or 32 and W0 <= HW. That is, C is the aforementioned 3/4 + 1/sqrt(2) constant (from
/// which b/2 is subtracted to obtain x0) wrapped to [0, 1) range.
fn c_hw<F: Float>() -> HalfRep<F>
where
    F::Int: DInt,
    u128: CastInto<HalfRep<F>>,
{
    // The constant stored with the full 128 fractional bits available;
    // narrower float types take the top bits.
    const C_U128: u128 = 0x7504f333f9de6108b2fb1366eaa6a542;
    // Shift down so only the top `HW` fractional bits remain; evaluated at
    // compile time for each concrete `F`.
    const { C_U128 >> (u128::BITS - <HalfRep<F>>::BITS) }.cast()
}
/// Perform one iteration at any width to approach `1/b`, given previous guess `x`. Returns
/// the next `x` as a UQ0 number.
///
/// This is the `x_{n+1} = 2*x_n - b*x_n^2` algorithm, implemented as `x_n * (2 - b*x_n)`. It
/// uses widening multiplication to calculate the result with necessary precision.
fn next_guess<I>(x_uq0: I, b_uq1: I) -> I
where
    I: Int + HInt,
    <I as HInt>::D: ops::Shr<u32, Output = <I as HInt>::D>,
{
    // `corr = 2 - b*x_n`, computed as `0 - b*x_n`: in UQ1 arithmetic,
    // `0.0 - x` and `2.0 - x` wrap to the same bit pattern.
    let product_hi = x_uq0.widen_mul(b_uq1).hi();
    let correction: I = I::ZERO.wrapping_sub(product_hi);

    // `x_n * corr = x_n * (2 - b*x_n)`; keep the high half of the widened
    // product, shifted by one to account for the UQ1 format.
    let wide_next = x_uq0.widen_mul(correction);
    (wide_next >> (I::BITS - 1)).lo()
}
intrinsics! {
    // `f32` division via the generic soft-float implementation above.
    #[arm_aeabi_alias = __aeabi_fdiv]
    pub extern "C" fn __divsf3(a: f32, b: f32) -> f32 {
        div(a, b)
    }

    // `f64` division.
    #[arm_aeabi_alias = __aeabi_ddiv]
    pub extern "C" fn __divdf3(a: f64, b: f64) -> f64 {
        div(a, b)
    }

    // `f128` division; exposed under the `__divkf3` name on PowerPC.
    #[ppc_alias = __divkf3]
    #[cfg(f128_enabled)]
    pub extern "C" fn __divtf3(a: f128, b: f128) -> f128 {
        div(a, b)
    }

    // NOTE(review): the `*vfp` variants simply use native division —
    // presumably only reached on Arm targets where hardware FP is available.
    #[cfg(target_arch = "arm")]
    pub extern "C" fn __divsf3vfp(a: f32, b: f32) -> f32 {
        a / b
    }

    #[cfg(target_arch = "arm")]
    pub extern "C" fn __divdf3vfp(a: f64, b: f64) -> f64 {
        a / b
    }
}

View file

@ -0,0 +1,123 @@
use crate::float::Float;
use crate::int::{CastInto, Int, MinInt};
/// Generic conversion from a narrower to a wider IEEE-754 floating-point type
fn extend<F: Float, R: Float>(a: F) -> R
where
    F::Int: CastInto<u64>,
    u64: CastInto<F::Int>,
    u32: CastInto<R::Int>,
    R::Int: CastInto<u32>,
    R::Int: CastInto<u64>,
    u64: CastInto<R::Int>,
    F::Int: CastInto<R::Int>,
{
    // Source ("F") format parameters.
    let src_zero = F::Int::ZERO;
    let src_one = F::Int::ONE;
    let src_bits = F::BITS;
    let src_sig_bits = F::SIG_BITS;
    let src_exp_bias = F::EXP_BIAS;
    let src_min_normal = F::IMPLICIT_BIT;
    let src_infinity = F::EXP_MASK;
    let src_sign_mask = F::SIGN_MASK;
    let src_abs_mask = src_sign_mask - src_one;
    let src_qnan = F::SIG_MASK;
    let src_nan_code = src_qnan - src_one;

    // Destination ("R") format parameters.
    let dst_bits = R::BITS;
    let dst_sig_bits = R::SIG_BITS;
    let dst_inf_exp = R::EXP_SAT;
    let dst_exp_bias = R::EXP_BIAS;
    let dst_min_normal = R::IMPLICIT_BIT;

    // Widening a float never loses precision, so no rounding is needed
    // anywhere below: the significand is shifted up and the exponent rebiased.
    let sig_bits_delta = dst_sig_bits - src_sig_bits;
    let exp_bias_delta = dst_exp_bias - src_exp_bias;
    let a_abs = a.to_bits() & src_abs_mask;
    let mut abs_result = R::Int::ZERO;

    // This wrapping-sub range check is true exactly when
    // `src_min_normal <= a_abs < src_infinity`.
    if a_abs.wrapping_sub(src_min_normal) < src_infinity.wrapping_sub(src_min_normal) {
        // a is a normal number.
        // Extend to the destination type by shifting the significand and
        // exponent into the proper position and rebiasing the exponent.
        let abs_dst: R::Int = a_abs.cast();
        let bias_dst: R::Int = exp_bias_delta.cast();
        abs_result = abs_dst.wrapping_shl(sig_bits_delta);
        abs_result += bias_dst.wrapping_shl(dst_sig_bits);
    } else if a_abs >= src_infinity {
        // a is NaN or infinity.
        // Conjure the result by beginning with infinity, then setting the qNaN
        // bit (if needed) and right-aligning the rest of the trailing NaN
        // payload field.
        let qnan_dst: R::Int = (a_abs & src_qnan).cast();
        let nan_code_dst: R::Int = (a_abs & src_nan_code).cast();
        let inf_exp_dst: R::Int = dst_inf_exp.cast();
        abs_result = inf_exp_dst.wrapping_shl(dst_sig_bits);
        abs_result |= qnan_dst.wrapping_shl(sig_bits_delta);
        abs_result |= nan_code_dst.wrapping_shl(sig_bits_delta);
    } else if a_abs != src_zero {
        // a is denormal.
        // Renormalize the significand and clear the leading bit, then insert
        // the correct adjusted exponent in the destination type.
        let scale = a_abs.leading_zeros() - src_min_normal.leading_zeros();
        let abs_dst: R::Int = a_abs.cast();
        let bias_dst: R::Int = (exp_bias_delta - scale + 1).cast();
        abs_result = abs_dst.wrapping_shl(sig_bits_delta + scale);
        // XOR clears the shifted-up leading significand bit; OR inserts the
        // computed exponent field.
        abs_result = (abs_result ^ dst_min_normal) | (bias_dst.wrapping_shl(dst_sig_bits));
    }
    // (a_abs == 0 falls through: the result is a signed zero.)

    // Apply the sign bit, moved into the destination's sign position.
    let sign_result: R::Int = (a.to_bits() & src_sign_mask).cast();
    R::from_bits(abs_result | (sign_result.wrapping_shl(dst_bits - src_bits)))
}
intrinsics! {
    // `f32` -> `f64` widening conversion.
    #[aapcs_on_arm]
    #[arm_aeabi_alias = __aeabi_f2d]
    pub extern "C" fn __extendsfdf2(a: f32) -> f64 {
        extend(a)
    }
}
intrinsics! {
    // `f16` -> `f32` widening conversion.
    #[aapcs_on_arm]
    #[apple_f16_arg_abi]
    #[arm_aeabi_alias = __aeabi_h2f]
    #[cfg(f16_enabled)]
    pub extern "C" fn __extendhfsf2(a: f16) -> f32 {
        extend(a)
    }

    // GNU spelling of the `f16` -> `f32` conversion (same operation as
    // `__extendhfsf2`).
    #[aapcs_on_arm]
    #[apple_f16_arg_abi]
    #[cfg(f16_enabled)]
    pub extern "C" fn __gnu_h2f_ieee(a: f16) -> f32 {
        extend(a)
    }

    // `f16` -> `f64` widening conversion.
    #[aapcs_on_arm]
    #[apple_f16_arg_abi]
    #[cfg(f16_enabled)]
    pub extern "C" fn __extendhfdf2(a: f16) -> f64 {
        extend(a)
    }

    // `f16` -> `f128` widening conversion; `__extendhfkf2` on PowerPC.
    #[aapcs_on_arm]
    #[ppc_alias = __extendhfkf2]
    #[cfg(all(f16_enabled, f128_enabled))]
    pub extern "C" fn __extendhftf2(a: f16) -> f128 {
        extend(a)
    }

    // `f32` -> `f128` widening conversion; `__extendsfkf2` on PowerPC.
    #[aapcs_on_arm]
    #[ppc_alias = __extendsfkf2]
    #[cfg(f128_enabled)]
    pub extern "C" fn __extendsftf2(a: f32) -> f128 {
        extend(a)
    }

    // `f64` -> `f128` widening conversion; `__extenddfkf2` on PowerPC.
    #[aapcs_on_arm]
    #[ppc_alias = __extenddfkf2]
    #[cfg(f128_enabled)]
    pub extern "C" fn __extenddftf2(a: f64) -> f128 {
        extend(a)
    }
}

View file

@ -0,0 +1,15 @@
pub mod add;
pub mod cmp;
pub mod conv;
pub mod div;
pub mod extend;
pub mod mul;
pub mod pow;
pub mod sub;
pub(crate) mod traits;
pub mod trunc;
#[cfg(not(feature = "unstable-public-internals"))]
pub(crate) use traits::{Float, HalfRep};
#[cfg(feature = "unstable-public-internals")]
pub use traits::{Float, HalfRep};

View file

@ -0,0 +1,200 @@
use crate::float::Float;
use crate::int::{CastInto, DInt, HInt, Int, MinInt};
/// Generic IEEE-754 soft-float multiplication with round-to-nearest,
/// ties-to-even rounding.
fn mul<F: Float>(a: F, b: F) -> F
where
    u32: CastInto<F::Int>,
    F::Int: CastInto<u32>,
    i32: CastInto<F::Int>,
    F::Int: CastInto<i32>,
    F::Int: HInt,
{
    // Format parameters for the float type.
    let one = F::Int::ONE;
    let zero = F::Int::ZERO;
    let bits = F::BITS;
    let significand_bits = F::SIG_BITS;
    let max_exponent = F::EXP_SAT;
    let exponent_bias = F::EXP_BIAS;

    let implicit_bit = F::IMPLICIT_BIT;
    let significand_mask = F::SIG_MASK;
    let sign_bit = F::SIGN_MASK;
    let abs_mask = sign_bit - one;
    let exponent_mask = F::EXP_MASK;
    let inf_rep = exponent_mask;
    let quiet_bit = implicit_bit >> 1;
    let qnan_rep = exponent_mask | quiet_bit;
    let exponent_bits = F::EXP_BITS;

    let a_rep = a.to_bits();
    let b_rep = b.to_bits();

    let a_exponent = (a_rep >> significand_bits) & max_exponent.cast();
    let b_exponent = (b_rep >> significand_bits) & max_exponent.cast();
    // The result's sign is the XOR of the operand signs.
    let product_sign = (a_rep ^ b_rep) & sign_bit;

    let mut a_significand = a_rep & significand_mask;
    let mut b_significand = b_rep & significand_mask;
    // Exponent adjustment accumulated while renormalizing denormals.
    let mut scale = 0;

    // Detect if a or b is zero, denormal, infinity, or NaN.
    if a_exponent.wrapping_sub(one) >= (max_exponent - 1).cast()
        || b_exponent.wrapping_sub(one) >= (max_exponent - 1).cast()
    {
        let a_abs = a_rep & abs_mask;
        let b_abs = b_rep & abs_mask;

        // NaN * anything = qNaN
        if a_abs > inf_rep {
            return F::from_bits(a_rep | quiet_bit);
        }
        // anything * NaN = qNaN
        if b_abs > inf_rep {
            return F::from_bits(b_rep | quiet_bit);
        }

        if a_abs == inf_rep {
            if b_abs != zero {
                // infinity * non-zero = +/- infinity
                return F::from_bits(a_abs | product_sign);
            } else {
                // infinity * zero = NaN
                return F::from_bits(qnan_rep);
            }
        }

        if b_abs == inf_rep {
            if a_abs != zero {
                // infinity * non-zero = +/- infinity
                return F::from_bits(b_abs | product_sign);
            } else {
                // infinity * zero = NaN
                return F::from_bits(qnan_rep);
            }
        }

        // zero * anything = +/- zero
        if a_abs == zero {
            return F::from_bits(product_sign);
        }

        // anything * zero = +/- zero
        if b_abs == zero {
            return F::from_bits(product_sign);
        }

        // one or both of a or b is denormal, the other (if applicable) is a
        // normal number. Renormalize one or both of a and b, and set scale to
        // include the necessary exponent adjustment.
        if a_abs < implicit_bit {
            let (exponent, significand) = F::normalize(a_significand);
            scale += exponent;
            a_significand = significand;
        }

        if b_abs < implicit_bit {
            let (exponent, significand) = F::normalize(b_significand);
            scale += exponent;
            b_significand = significand;
        }
    }

    // Or in the implicit significand bit. (If we fell through from the
    // denormal path it was already set by normalize( ), but setting it twice
    // won't hurt anything.)
    a_significand |= implicit_bit;
    b_significand |= implicit_bit;

    // Get the significand of a*b. Before multiplying the significands, shift
    // one of them left to left-align it in the field. Thus, the product will
    // have (exponentBits + 2) integral digits, all but two of which must be
    // zero. Normalizing this result is just a conditional left-shift by one
    // and bumping the exponent accordingly.
    let (mut product_low, mut product_high) = a_significand
        .widen_mul(b_significand << exponent_bits)
        .lo_hi();

    let a_exponent_i32: i32 = a_exponent.cast();
    let b_exponent_i32: i32 = b_exponent.cast();
    let mut product_exponent: i32 = a_exponent_i32
        .wrapping_add(b_exponent_i32)
        .wrapping_add(scale)
        .wrapping_sub(exponent_bias as i32);

    // Normalize the significand, adjust exponent if needed.
    if (product_high & implicit_bit) != zero {
        product_exponent = product_exponent.wrapping_add(1);
    } else {
        product_high = (product_high << 1) | (product_low >> (bits - 1));
        product_low <<= 1;
    }

    // If we have overflowed the type, return +/- infinity.
    if product_exponent >= max_exponent as i32 {
        return F::from_bits(inf_rep | product_sign);
    }

    if product_exponent <= 0 {
        // Result is denormal before rounding
        //
        // If the result is so small that it just underflows to zero, return
        // a zero of the appropriate sign. Mathematically there is no need to
        // handle this case separately, but we make it a special case to
        // simplify the shift logic.
        let shift = one.wrapping_sub(product_exponent.cast()).cast();
        if shift >= bits {
            return F::from_bits(product_sign);
        }

        // Otherwise, shift the significand of the result so that the round
        // bit is the high bit of `product_low`.
        // Ensure one of the non-highest bits in `product_low` is set if the shifted out bit are
        // not all zero so that the result is correctly rounded below.
        let sticky = product_low << (bits - shift) != zero;
        product_low =
            (product_high << (bits - shift)) | (product_low >> shift) | (sticky as u32).cast();
        product_high >>= shift;
    } else {
        // Result is normal before rounding; insert the exponent.
        product_high &= significand_mask;
        product_high |= product_exponent.cast() << significand_bits;
    }

    // Insert the sign of the result:
    product_high |= product_sign;

    // Final rounding. The final result may overflow to infinity, or underflow
    // to zero, but those are the correct results in those cases. We use the
    // default IEEE-754 round-to-nearest, ties-to-even rounding mode.
    if product_low > sign_bit {
        product_high += one;
    }

    if product_low == sign_bit {
        product_high += product_high & one;
    }

    F::from_bits(product_high)
}
intrinsics! {
    // `f32` multiplication via the generic soft-float implementation.
    #[aapcs_on_arm]
    #[arm_aeabi_alias = __aeabi_fmul]
    pub extern "C" fn __mulsf3(a: f32, b: f32) -> f32 {
        mul(a, b)
    }

    // `f64` multiplication.
    #[aapcs_on_arm]
    #[arm_aeabi_alias = __aeabi_dmul]
    pub extern "C" fn __muldf3(a: f64, b: f64) -> f64 {
        mul(a, b)
    }

    // `f128` multiplication; `__mulkf3` on PowerPC.
    #[ppc_alias = __mulkf3]
    #[cfg(f128_enabled)]
    pub extern "C" fn __multf3(a: f128, b: f128) -> f128 {
        mul(a, b)
    }
}

View file

@ -0,0 +1,40 @@
use crate::float::Float;
use crate::int::Int;
/// Returns `a` raised to the power `b`.
///
/// Uses binary exponentiation (square-and-multiply) on the magnitude of the
/// exponent; a negative exponent is handled by taking the reciprocal of the
/// accumulated product at the end.
fn pow<F: Float>(a: F, b: i32) -> F {
    let invert = b < 0;
    // Magnitude of the exponent as an unsigned value (handles `i32::MIN`).
    let mut exp = Int::abs_diff(b, 0);
    let mut base = a;
    let mut acc = F::ONE;

    while exp != 0 {
        if exp & 1 != 0 {
            acc *= base;
        }
        exp >>= 1;
        // Skip the final squaring once all exponent bits are consumed, just
        // like the original `loop`/`break` structure did.
        if exp != 0 {
            base *= base;
        }
    }

    if invert { F::ONE / acc } else { acc }
}
intrinsics! {
    // `f32` raised to an `i32` power.
    pub extern "C" fn __powisf2(a: f32, b: i32) -> f32 {
        pow(a, b)
    }

    // `f64` raised to an `i32` power.
    pub extern "C" fn __powidf2(a: f64, b: i32) -> f64 {
        pow(a, b)
    }

    // `f128` raised to an `i32` power; `__powikf2` on PowerPC.
    #[ppc_alias = __powikf2]
    #[cfg(f128_enabled)]
    // FIXME(f16_f128): MSVC cannot build these until `__divtf3` is available in nightly.
    #[cfg(not(target_env = "msvc"))]
    pub extern "C" fn __powitf2(a: f128, b: i32) -> f128 {
        pow(a, b)
    }
}

View file

@ -0,0 +1,24 @@
use crate::float::Float;
intrinsics! {
    // `f32` subtraction, implemented as addition of the negated operand
    // (flipping the sign bit negates any float, including NaN/inf payload-
    // preserving).
    #[arm_aeabi_alias = __aeabi_fsub]
    pub extern "C" fn __subsf3(a: f32, b: f32) -> f32 {
        crate::float::add::__addsf3(a, f32::from_bits(b.to_bits() ^ f32::SIGN_MASK))
    }

    // `f64` subtraction via negated addition.
    #[arm_aeabi_alias = __aeabi_dsub]
    pub extern "C" fn __subdf3(a: f64, b: f64) -> f64 {
        crate::float::add::__adddf3(a, f64::from_bits(b.to_bits() ^ f64::SIGN_MASK))
    }

    // `f128` subtraction; on PowerPC the addition intrinsic is named
    // `__addkf3`, so alias it locally before use.
    #[ppc_alias = __subkf3]
    #[cfg(f128_enabled)]
    pub extern "C" fn __subtf3(a: f128, b: f128) -> f128 {
        #[cfg(any(target_arch = "powerpc", target_arch = "powerpc64"))]
        use crate::float::add::__addkf3 as __addtf3;
        #[cfg(not(any(target_arch = "powerpc", target_arch = "powerpc64")))]
        use crate::float::add::__addtf3;
        __addtf3(a, f128::from_bits(b.to_bits() ^ f128::SIGN_MASK))
    }
}

View file

@ -0,0 +1,189 @@
use core::ops;
use crate::int::{DInt, Int, MinInt};
/// Wrapper to extract the integer type half of the float's size
pub type HalfRep<F> = <<F as Float>::Int as DInt>::H;

/// Trait for some basic operations on floats
#[allow(dead_code)]
pub trait Float:
    Copy
    + core::fmt::Debug
    + PartialEq
    + PartialOrd
    + ops::AddAssign
    + ops::MulAssign
    + ops::Add<Output = Self>
    + ops::Sub<Output = Self>
    + ops::Div<Output = Self>
    + ops::Rem<Output = Self>
{
    /// A uint of the same width as the float
    type Int: Int<OtherSign = Self::SignedInt, UnsignedInt = Self::Int>;

    /// A int of the same width as the float
    type SignedInt: Int + MinInt<OtherSign = Self::Int, UnsignedInt = Self::Int>;

    /// An int capable of containing the exponent bits plus a sign bit. This is signed.
    type ExpInt: Int;

    // Positive zero of the float type.
    const ZERO: Self;
    // One of the float type.
    const ONE: Self;

    /// The bitwidth of the float type.
    const BITS: u32;

    /// The bitwidth of the significand.
    const SIG_BITS: u32;

    /// The bitwidth of the exponent.
    const EXP_BITS: u32 = Self::BITS - Self::SIG_BITS - 1;

    /// The saturated (maximum bitpattern) value of the exponent, i.e. the infinite
    /// representation.
    ///
    /// This is in the rightmost position, use `EXP_MASK` for the shifted value.
    const EXP_SAT: u32 = (1 << Self::EXP_BITS) - 1;

    /// The exponent bias value.
    const EXP_BIAS: u32 = Self::EXP_SAT >> 1;

    /// A mask for the sign bit.
    const SIGN_MASK: Self::Int;

    /// A mask for the significand.
    const SIG_MASK: Self::Int;

    /// The implicit bit of the float format.
    const IMPLICIT_BIT: Self::Int;

    /// A mask for the exponent.
    const EXP_MASK: Self::Int;

    /// Returns `self` transmuted to `Self::Int`
    fn to_bits(self) -> Self::Int;

    /// Returns `self` transmuted to `Self::SignedInt`
    fn to_bits_signed(self) -> Self::SignedInt;

    /// Checks if two floats have the same bit representation. *Except* for NaNs! NaN can be
    /// represented in multiple different ways. This method returns `true` if two NaNs are
    /// compared.
    fn eq_repr(self, rhs: Self) -> bool;

    /// Returns true if the sign is negative
    fn is_sign_negative(self) -> bool;

    /// Returns the exponent, not adjusting for bias.
    fn exp(self) -> Self::ExpInt;

    /// Returns the significand with no implicit bit (or the "fractional" part)
    fn frac(self) -> Self::Int;

    /// Returns the significand with implicit bit
    fn imp_frac(self) -> Self::Int;

    /// Returns a `Self::Int` transmuted back to `Self`
    fn from_bits(a: Self::Int) -> Self;

    /// Constructs a `Self` from its parts. Inputs are treated as bits and shifted into position.
    fn from_parts(negative: bool, exponent: Self::Int, significand: Self::Int) -> Self;

    /// Returns the absolute value of `self`, computed by clearing the sign bit.
    fn abs(self) -> Self {
        let abs_mask = !Self::SIGN_MASK;
        Self::from_bits(self.to_bits() & abs_mask)
    }

    /// Returns (normalized exponent, normalized significand)
    fn normalize(significand: Self::Int) -> (i32, Self::Int);

    /// Returns if `self` is subnormal
    fn is_subnormal(self) -> bool;
}
// Implements `Float` for a concrete type. Arguments: float type, same-width
// unsigned int, same-width signed int, exponent int, total bits, significand
// bits.
macro_rules! float_impl {
    ($ty:ident, $ity:ident, $sity:ident, $expty:ident, $bits:expr, $significand_bits:expr) => {
        impl Float for $ty {
            type Int = $ity;
            type SignedInt = $sity;
            type ExpInt = $expty;

            const ZERO: Self = 0.0;
            const ONE: Self = 1.0;

            const BITS: u32 = $bits;
            const SIG_BITS: u32 = $significand_bits;

            const SIGN_MASK: Self::Int = 1 << (Self::BITS - 1);
            const SIG_MASK: Self::Int = (1 << Self::SIG_BITS) - 1;
            const IMPLICIT_BIT: Self::Int = 1 << Self::SIG_BITS;
            const EXP_MASK: Self::Int = !(Self::SIGN_MASK | Self::SIG_MASK);

            fn to_bits(self) -> Self::Int {
                self.to_bits()
            }
            fn to_bits_signed(self) -> Self::SignedInt {
                self.to_bits() as Self::SignedInt
            }
            fn eq_repr(self, rhs: Self) -> bool {
                #[cfg(feature = "mangled-names")]
                fn is_nan(x: $ty) -> bool {
                    // When using mangled-names, the "real" compiler-builtins might not have the
                    // necessary builtin (__unordtf2) to test whether `f128` is NaN.
                    // FIXME(f16_f128): Remove once the nightly toolchain has the __unordtf2 builtin
                    // x is NaN if all the bits of the exponent are set and the significand is non-0
                    x.to_bits() & $ty::EXP_MASK == $ty::EXP_MASK && x.to_bits() & $ty::SIG_MASK != 0
                }
                #[cfg(not(feature = "mangled-names"))]
                fn is_nan(x: $ty) -> bool {
                    x.is_nan()
                }
                // Bitwise equality, except any two NaNs compare equal.
                if is_nan(self) && is_nan(rhs) {
                    true
                } else {
                    self.to_bits() == rhs.to_bits()
                }
            }
            fn is_sign_negative(self) -> bool {
                self.is_sign_negative()
            }
            fn exp(self) -> Self::ExpInt {
                ((self.to_bits() & Self::EXP_MASK) >> Self::SIG_BITS) as Self::ExpInt
            }
            fn frac(self) -> Self::Int {
                self.to_bits() & Self::SIG_MASK
            }
            fn imp_frac(self) -> Self::Int {
                self.frac() | Self::IMPLICIT_BIT
            }
            fn from_bits(a: Self::Int) -> Self {
                Self::from_bits(a)
            }
            fn from_parts(negative: bool, exponent: Self::Int, significand: Self::Int) -> Self {
                Self::from_bits(
                    ((negative as Self::Int) << (Self::BITS - 1))
                        | ((exponent << Self::SIG_BITS) & Self::EXP_MASK)
                        | (significand & Self::SIG_MASK),
                )
            }
            fn normalize(significand: Self::Int) -> (i32, Self::Int) {
                // Shift until the highest significand bit reaches the implicit
                // bit position; the returned exponent compensates.
                let shift = significand.leading_zeros().wrapping_sub(Self::EXP_BITS);
                (
                    1i32.wrapping_sub(shift as i32),
                    significand << shift as Self::Int,
                )
            }
            fn is_subnormal(self) -> bool {
                (self.to_bits() & Self::EXP_MASK) == Self::Int::ZERO
            }
        }
    };
}
// Concrete `Float` implementations:
// (type, unsigned int, signed int, exponent int, total bits, significand bits).
#[cfg(f16_enabled)]
float_impl!(f16, u16, i16, i8, 16, 10);
float_impl!(f32, u32, i32, i16, 32, 23);
float_impl!(f64, u64, i64, i16, 64, 52);
#[cfg(f128_enabled)]
float_impl!(f128, u128, i128, i16, 128, 112);

View file

@ -0,0 +1,169 @@
use crate::float::Float;
use crate::int::{CastInto, Int, MinInt};
/// Generic conversion from a wider to a narrower IEEE-754 floating-point type,
/// rounding to nearest with ties-to-even.
fn trunc<F: Float, R: Float>(a: F) -> R
where
    F::Int: CastInto<u64>,
    F::Int: CastInto<u32>,
    u64: CastInto<F::Int>,
    u32: CastInto<F::Int>,
    R::Int: CastInto<u32>,
    u32: CastInto<R::Int>,
    F::Int: CastInto<R::Int>,
{
    // Source ("F") format parameters.
    let src_zero = F::Int::ZERO;
    let src_one = F::Int::ONE;
    let src_bits = F::BITS;
    let src_exp_bias = F::EXP_BIAS;

    let src_min_normal = F::IMPLICIT_BIT;
    let src_sig_mask = F::SIG_MASK;
    let src_infinity = F::EXP_MASK;
    let src_sign_mask = F::SIGN_MASK;
    let src_abs_mask = src_sign_mask - src_one;
    // Bits that will be discarded by the narrowing shift, and the halfway
    // point used for round-to-nearest.
    let round_mask = (src_one << (F::SIG_BITS - R::SIG_BITS)) - src_one;
    let halfway = src_one << (F::SIG_BITS - R::SIG_BITS - 1);
    let src_qnan = src_one << (F::SIG_BITS - 1);
    let src_nan_code = src_qnan - src_one;

    // Destination ("R") format parameters.
    let dst_zero = R::Int::ZERO;
    let dst_one = R::Int::ONE;
    let dst_bits = R::BITS;
    let dst_inf_exp = R::EXP_SAT;
    let dst_exp_bias = R::EXP_BIAS;

    // Source-format encodings of the smallest and largest exponents that map
    // to destination normals.
    let underflow_exponent: F::Int = (src_exp_bias + 1 - dst_exp_bias).cast();
    let overflow_exponent: F::Int = (src_exp_bias + dst_inf_exp - dst_exp_bias).cast();
    let underflow: F::Int = underflow_exponent << F::SIG_BITS;
    let overflow: F::Int = overflow_exponent << F::SIG_BITS;

    let dst_qnan = R::Int::ONE << (R::SIG_BITS - 1);
    let dst_nan_code = dst_qnan - dst_one;

    let sig_bits_delta = F::SIG_BITS - R::SIG_BITS;
    // Break a into a sign and representation of the absolute value.
    let a_abs = a.to_bits() & src_abs_mask;
    let sign = a.to_bits() & src_sign_mask;
    let mut abs_result: R::Int;

    // This wrapping-sub comparison is true exactly when
    // `underflow <= a_abs < overflow`.
    if a_abs.wrapping_sub(underflow) < a_abs.wrapping_sub(overflow) {
        // The exponent of a is within the range of normal numbers in the
        // destination format. We can convert by simply right-shifting with
        // rounding and adjusting the exponent.
        abs_result = (a_abs >> sig_bits_delta).cast();

        // Cast before shifting to prevent overflow.
        let bias_diff: R::Int = src_exp_bias.wrapping_sub(dst_exp_bias).cast();
        let tmp = bias_diff << R::SIG_BITS;
        abs_result = abs_result.wrapping_sub(tmp);

        let round_bits = a_abs & round_mask;
        if round_bits > halfway {
            // Round to nearest.
            abs_result += dst_one;
        } else if round_bits == halfway {
            // Tie to even.
            abs_result += abs_result & dst_one;
        };
    } else if a_abs > src_infinity {
        // a is NaN.
        // Conjure the result by beginning with infinity, setting the qNaN
        // bit and inserting the (truncated) trailing NaN field.

        // Cast before shifting to prevent overflow.
        let dst_inf_exp: R::Int = dst_inf_exp.cast();
        abs_result = dst_inf_exp << R::SIG_BITS;

        abs_result |= dst_qnan;
        abs_result |= dst_nan_code & ((a_abs & src_nan_code) >> (F::SIG_BITS - R::SIG_BITS)).cast();
    } else if a_abs >= overflow {
        // a overflows to infinity.

        // Cast before shifting to prevent overflow.
        let dst_inf_exp: R::Int = dst_inf_exp.cast();
        abs_result = dst_inf_exp << R::SIG_BITS;
    } else {
        // a underflows on conversion to the destination type or is an exact
        // zero. The result may be a denormal or zero. Extract the exponent
        // to get the shift amount for the denormalization.
        let a_exp: u32 = (a_abs >> F::SIG_BITS).cast();
        let shift = src_exp_bias - dst_exp_bias - a_exp + 1;

        // Restore the implicit bit before denormalizing.
        let significand = (a.to_bits() & src_sig_mask) | src_min_normal;

        // Right shift by the denormalization amount with sticky.
        if shift > F::SIG_BITS {
            abs_result = dst_zero;
        } else {
            // The sticky bit records whether any shifted-out bit was set, so
            // rounding below sees a value strictly above/below halfway.
            let sticky = if (significand << (src_bits - shift)) != src_zero {
                src_one
            } else {
                src_zero
            };
            let denormalized_significand: F::Int = (significand >> shift) | sticky;
            abs_result = (denormalized_significand >> (F::SIG_BITS - R::SIG_BITS)).cast();
            let round_bits = denormalized_significand & round_mask;
            // Round to nearest
            if round_bits > halfway {
                abs_result += dst_one;
            }
            // Ties to even
            else if round_bits == halfway {
                abs_result += abs_result & dst_one;
            };
        }
    }

    // Apply the signbit to the absolute value.
    R::from_bits(abs_result | sign.wrapping_shr(src_bits - dst_bits).cast())
}
intrinsics! {
    // `f64` -> `f32` narrowing conversion.
    #[aapcs_on_arm]
    #[arm_aeabi_alias = __aeabi_d2f]
    pub extern "C" fn __truncdfsf2(a: f64) -> f32 {
        trunc(a)
    }
}
intrinsics! {
    // `f32` -> `f16` narrowing conversion.
    #[aapcs_on_arm]
    #[apple_f16_ret_abi]
    #[arm_aeabi_alias = __aeabi_f2h]
    #[cfg(f16_enabled)]
    pub extern "C" fn __truncsfhf2(a: f32) -> f16 {
        trunc(a)
    }

    // GNU spelling of the `f32` -> `f16` conversion (same operation as
    // `__truncsfhf2`).
    #[aapcs_on_arm]
    #[apple_f16_ret_abi]
    #[cfg(f16_enabled)]
    pub extern "C" fn __gnu_f2h_ieee(a: f32) -> f16 {
        trunc(a)
    }

    // `f64` -> `f16` narrowing conversion.
    #[aapcs_on_arm]
    #[apple_f16_ret_abi]
    #[arm_aeabi_alias = __aeabi_d2h]
    #[cfg(f16_enabled)]
    pub extern "C" fn __truncdfhf2(a: f64) -> f16 {
        trunc(a)
    }

    // `f128` -> `f16` narrowing conversion; `__trunckfhf2` on PowerPC.
    #[aapcs_on_arm]
    #[ppc_alias = __trunckfhf2]
    #[cfg(all(f16_enabled, f128_enabled))]
    pub extern "C" fn __trunctfhf2(a: f128) -> f16 {
        trunc(a)
    }

    // `f128` -> `f32` narrowing conversion; `__trunckfsf2` on PowerPC.
    #[aapcs_on_arm]
    #[ppc_alias = __trunckfsf2]
    #[cfg(f128_enabled)]
    pub extern "C" fn __trunctfsf2(a: f128) -> f32 {
        trunc(a)
    }

    // `f128` -> `f64` narrowing conversion; `__trunckfdf2` on PowerPC.
    #[aapcs_on_arm]
    #[ppc_alias = __trunckfdf2]
    #[cfg(f128_enabled)]
    pub extern "C" fn __trunctfdf2(a: f128) -> f64 {
        trunc(a)
    }
}

View file

@ -0,0 +1,55 @@
// Hexagon builtins are provided as hand-written assembly, pulled in verbatim
// as raw `global_asm!` blocks. The whole module is compiled out when the
// `no-asm` feature is enabled.
#![cfg(not(feature = "no-asm"))]

use core::arch::global_asm;

// Shared macros used by the other assembly files; must be emitted first.
global_asm!(include_str!("hexagon/func_macro.s"), options(raw));

// Double-precision float arithmetic.
global_asm!(include_str!("hexagon/dfaddsub.s"), options(raw));
global_asm!(include_str!("hexagon/dfdiv.s"), options(raw));
global_asm!(include_str!("hexagon/dffma.s"), options(raw));
global_asm!(include_str!("hexagon/dfminmax.s"), options(raw));
global_asm!(include_str!("hexagon/dfmul.s"), options(raw));
global_asm!(include_str!("hexagon/dfsqrt.s"), options(raw));

// Integer division/modulo.
global_asm!(include_str!("hexagon/divdi3.s"), options(raw));
global_asm!(include_str!("hexagon/divsi3.s"), options(raw));

// Fast-math helper libraries.
global_asm!(include_str!("hexagon/fastmath2_dlib_asm.s"), options(raw));
global_asm!(include_str!("hexagon/fastmath2_ldlib_asm.s"), options(raw));

// Specialized memcpy variants.
global_asm!(
    include_str!("hexagon/memcpy_forward_vp4cp4n2.s"),
    options(raw)
);
global_asm!(
    include_str!("hexagon/memcpy_likely_aligned.s"),
    options(raw)
);

// Remaining integer and single-precision float routines.
global_asm!(include_str!("hexagon/moddi3.s"), options(raw));
global_asm!(include_str!("hexagon/modsi3.s"), options(raw));
global_asm!(include_str!("hexagon/sfdiv_opt.s"), options(raw));
global_asm!(include_str!("hexagon/sfsqrt_opt.s"), options(raw));
global_asm!(include_str!("hexagon/udivdi3.s"), options(raw));
global_asm!(include_str!("hexagon/udivmoddi4.s"), options(raw));
global_asm!(include_str!("hexagon/udivmodsi4.s"), options(raw));
global_asm!(include_str!("hexagon/udivsi3.s"), options(raw));
global_asm!(include_str!("hexagon/umoddi3.s"), options(raw));
global_asm!(include_str!("hexagon/umodsi3.s"), options(raw));

View file

@ -0,0 +1,321 @@
// Double-precision (IEEE-754 binary64) addition and subtraction for
// Hexagon. Defines __hexagon_adddf3 and __hexagon_subdf3; the __qdsp_*
// and __hexagon_fast*_ symbols below are `.set` aliases of the same
// entry points.
// NOTE(review): ABI inferred from the code -- operands in r1:0 and
// r3:2, result in r1:0 (see convert_d2df into r1:0), return via
// `jumpr r31`. Confirm against the Hexagon ABI before changing.
.text
.global __hexagon_adddf3
.global __hexagon_subdf3
.type __hexagon_adddf3, @function
.type __hexagon_subdf3, @function
.global __qdsp_adddf3 ; .set __qdsp_adddf3, __hexagon_adddf3
.global __hexagon_fast_adddf3 ; .set __hexagon_fast_adddf3, __hexagon_adddf3
.global __hexagon_fast2_adddf3 ; .set __hexagon_fast2_adddf3, __hexagon_adddf3
.global __qdsp_subdf3 ; .set __qdsp_subdf3, __hexagon_subdf3
.global __hexagon_fast_subdf3 ; .set __hexagon_fast_subdf3, __hexagon_subdf3
.global __hexagon_fast2_subdf3 ; .set __hexagon_fast2_subdf3, __hexagon_subdf3
.p2align 5
// r1:0 + r3:2 -> r1:0
__hexagon_adddf3:
// Extract the 11-bit exponent fields (bit 20 of the high words r1/r3,
// i.e. bit 52 of each double) and test both operands for the "normal"
// class (dfclass #2); both predicates must hold for the fast path.
{
r4 = extractu(r1,#11,#20)
r5 = extractu(r3,#11,#20)
r13:12 = combine(##0x20000000,#0)
}
{
p3 = dfclass(r1:0,#2)
p3 = dfclass(r3:2,#2)
r9:8 = r13:12
p2 = cmp.gtu(r5,r4)
}
// Swap operands so the one with the larger exponent sits in r1:0;
// take the slow path when either input is not a normal number.
{
if (!p3) jump .Ladd_abnormal
if (p2) r1:0 = r3:2
if (p2) r3:2 = r1:0
if (p2) r5:4 = combine(r4,r5)
}
// Build the mantissas (with extra guard bits) by inserting the low 52
// bits above a preset constant; r15 = exponent difference.
{
r13:12 = insert(r1:0,#52,#11 -2)
r9:8 = insert(r3:2,#52,#11 -2)
r15 = sub(r4,r5)
r7:6 = combine(#62,#1)
}
.Ladd_continue:
// Align the smaller mantissa: shift right by the (capped) exponent
// difference, folding the shifted-out bits into a sticky bit; negate
// the larger mantissa first if its sign bit (r1 < 0) is set.
{
r15 = min(r15,r7)
r11:10 = neg(r13:12)
p2 = cmp.gt(r1,#-1)
r14 = #0
}
{
if (!p2) r13:12 = r11:10
r11:10 = extractu(r9:8,r15:14)
r9:8 = ASR(r9:8,r15)
r15:14 = #0
}
{
p1 = cmp.eq(r11:10,r15:14)
if (!p1.new) r8 = or(r8,r6)
r5 = add(r4,#-1024 -60)
p3 = cmp.gt(r3,#-1)
}
// Compute both sum and difference; keep the one matching b's sign.
{
r13:12 = add(r13:12,r9:8)
r11:10 = sub(r13:12,r9:8)
r7:6 = combine(#54,##2045)
}
// If the biased exponent is outside (54, 2045) the result may
// overflow/underflow -- take the fixup path.
{
p0 = cmp.gtu(r4,r7)
p0 = !cmp.gtu(r4,r6)
if (!p0.new) jump:nt .Ladd_ovf_unf
if (!p3) r13:12 = r11:10
}
// Pack the signed 64-bit result back into a double; a zero mantissa
// needs the rounding-mode-aware zero path.
{
r1:0 = convert_d2df(r13:12)
p0 = cmp.eq(r13,#0)
p0 = cmp.eq(r12,#0)
if (p0.new) jump:nt .Ladd_zero
}
// Rescale by the saved exponent adjustment and return.
{
r1 += asl(r5,#20)
jumpr r31
}
.falign
// r1:0 - r3:2 == r1:0 + (-r3:2): flip b's sign bit and reuse the adder.
__hexagon_subdf3:
{
r3 = togglebit(r3,#31)
jump __qdsp_adddf3
}
.falign
// Exact zero result. The sign of the zero depends on USR bits 23:22
// (presumably the rounding-mode field -- confirm against the Hexagon
// USR spec): mode 2 yields -0, otherwise +0.
.Ladd_zero:
{
r28 = USR
r1:0 = #0
r3 = #1
}
{
r28 = extractu(r28,#2,#22)
r3 = asl(r3,#31)
}
{
p0 = cmp.eq(r28,#2)
if (p0.new) r1 = xor(r1,r3)
jumpr r31
}
.falign
// Result exponent is out of the safe range: rebuild the result and
// handle overflow (to ±inf/±max) or underflow (denormalize) explicitly.
.Ladd_ovf_unf:
{
r1:0 = convert_d2df(r13:12)
p0 = cmp.eq(r13,#0)
p0 = cmp.eq(r12,#0)
if (p0.new) jump:nt .Ladd_zero
}
{
r28 = extractu(r1,#11,#20)
r1 += asl(r5,#20)
}
{
r5 = add(r5,r28)
r3:2 = combine(##0x00100000,#0)
}
{
p0 = cmp.gt(r5,##1024 +1024 -2)
if (p0.new) jump:nt .Ladd_ovf
}
// Exponent still positive: the packed result is fine as-is.
{
p0 = cmp.gt(r5,#0)
if (p0.new) jumpr:t r31
r28 = sub(#1,r5)
}
// Underflow: shift the mantissa right to build a denormal.
{
r3:2 = insert(r1:0,#52,#0)
r1:0 = r13:12
}
{
r3:2 = lsr(r3:2,r28)
}
{
r1:0 = insert(r3:2,#63,#0)
jumpr r31
}
.falign
// Overflow: set USR flags 0x28 (presumably overflow|inexact -- confirm
// against the USR spec) and return ±max-finite or ±inf depending on the
// rounding mode and the result sign.
.Ladd_ovf:
{
r1:0 = r13:12
r28 = USR
r13:12 = combine(##0x7fefffff,#-1)
}
{
r5 = extractu(r28,#2,#22)
r28 = or(r28,#0x28)
r9:8 = combine(##0x7ff00000,#0)
}
{
USR = r28
r5 ^= lsr(r1,#31)
r28 = r5
}
{
p0 = !cmp.eq(r28,#1)
p0 = !cmp.eq(r5,#2)
if (p0.new) r13:12 = r9:8
}
{
r1:0 = insert(r13:12,#63,#0)
}
{
p0 = dfcmp.eq(r1:0,r1:0)
jumpr r31
}
// Slow path: at least one operand is NaN, inf, zero, or denormal.
// Order by magnitude so the larger |value| is in r1:0.
.Ladd_abnormal:
{
r13:12 = extractu(r1:0,#63,#0)
r9:8 = extractu(r3:2,#63,#0)
}
{
p3 = cmp.gtu(r13:12,r9:8)
if (!p3.new) r1:0 = r3:2
if (!p3.new) r3:2 = r1:0
}
{
p0 = dfclass(r1:0,#0x0f)
if (!p0.new) jump:nt .Linvalid_nan_add
if (!p3) r13:12 = r9:8
if (!p3) r9:8 = r13:12
}
{
p1 = dfclass(r1:0,#0x08)
if (p1.new) jump:nt .Linf_add
}
{
p2 = dfclass(r3:2,#0x01)
if (p2.new) jump:nt .LB_zero
r13:12 = #0
}
{
p0 = dfclass(r1:0,#4)
if (p0.new) jump:nt .Ladd_two_subnormal
r13:12 = combine(##0x20000000,#0)
}
// a is normal, b is denormal: treat b's exponent as 1 and rejoin the
// main path.
{
r4 = extractu(r1,#11,#20)
r5 = #1
r9:8 = asl(r9:8,#11 -2)
}
{
r13:12 = insert(r1:0,#52,#11 -2)
r15 = sub(r4,r5)
r7:6 = combine(#62,#1)
jump .Ladd_continue
}
// Both operands denormal: add the (sign-applied) raw mantissas
// directly; no rounding can occur.
.Ladd_two_subnormal:
{
r13:12 = extractu(r1:0,#63,#0)
r9:8 = extractu(r3:2,#63,#0)
}
{
r13:12 = neg(r13:12)
r9:8 = neg(r9:8)
p0 = cmp.gt(r1,#-1)
p1 = cmp.gt(r3,#-1)
}
{
if (p0) r13:12 = r1:0
if (p1) r9:8 = r3:2
}
{
r13:12 = add(r13:12,r9:8)
}
{
r9:8 = neg(r13:12)
p0 = cmp.gt(r13,#-1)
r3:2 = #0
}
{
if (!p0) r1:0 = r9:8
if (p0) r1:0 = r13:12
r3 = ##0x80000000
}
{
if (!p0) r1 = or(r1,r3)
p0 = dfcmp.eq(r1:0,r3:2)
if (p0.new) jump:nt .Lzero_plus_zero
}
{
jumpr r31
}
// NaN input: the convert_df2sf ops presumably signal invalid for
// signaling NaNs (confirm); return the all-ones pattern (a quiet NaN).
.Linvalid_nan_add:
{
r28 = convert_df2sf(r1:0)
p0 = dfclass(r3:2,#0x0f)
if (p0.new) r3:2 = r1:0
}
{
r2 = convert_df2sf(r3:2)
r1:0 = #-1
jumpr r31
}
.falign
// b is zero: return a unless a is zero too (then fall through).
.LB_zero:
{
p0 = dfcmp.eq(r13:12,r1:0)
if (!p0.new) jumpr:t r31
}
// (+0) + (-0): sign of the result follows the rounding mode, as in
// .Ladd_zero above.
.Lzero_plus_zero:
{
p0 = cmp.eq(r1:0,r3:2)
if (p0.new) jumpr:t r31
}
{
r28 = USR
}
{
r28 = extractu(r28,#2,#22)
r1:0 = #0
}
{
p0 = cmp.eq(r28,#2)
if (p0.new) r1 = ##0x80000000
jumpr r31
}
// a is inf: inf + (-inf) (both inf, opposite signs) is invalid -> NaN;
// any other b leaves the inf in r1:0 unchanged.
.Linf_add:
{
p0 = !cmp.eq(r1,r3)
p0 = dfclass(r3:2,#8)
if (!p0.new) jumpr:t r31
}
{
r2 = ##0x7f800001
}
{
r1:0 = convert_sf2df(r2)
jumpr r31
}
.size __hexagon_adddf3,.-__hexagon_adddf3

View file

@ -0,0 +1,372 @@
// Double-precision (binary64) division for Hexagon: __hexagon_divdf3,
// with __qdsp_divdf3 / __hexagon_fast*_divdf3 as `.set` aliases.
// Strategy visible below: seed a reciprocal with sfrecipa, refine it
// with sfmpy steps (Newton-Raphson style -- NOTE(review): confirm
// against the Hexagon manual), then form quotient digits with mpyu
// multiply/subtract passes.
// NOTE(review): ABI inferred -- dividend r1:0, divisor r3:2, result
// r1:0, return via `jumpr r31`.
.text
.global __hexagon_divdf3
.type __hexagon_divdf3,@function
.global __qdsp_divdf3 ; .set __qdsp_divdf3, __hexagon_divdf3
.global __hexagon_fast_divdf3 ; .set __hexagon_fast_divdf3, __hexagon_divdf3
.global __hexagon_fast2_divdf3 ; .set __hexagon_fast2_divdf3, __hexagon_divdf3
.p2align 5
__hexagon_divdf3:
// Fast path requires both operands to be normal (dfclass #2).
// r28 = sign of the result (xor of the high words).
{
p2 = dfclass(r1:0,#0x02)
p2 = dfclass(r3:2,#0x02)
r13:12 = combine(r3,r1)
r28 = xor(r1,r3)
}
{
if (!p2) jump .Ldiv_abnormal
r7:6 = extractu(r3:2,#23,#52 -23)
r8 = ##0x3f800001
}
// r9 = divisor's top mantissa bits packed into a single-float ~[1,2)
// for the reciprocal seed; r12/r13 = the two 11-bit exponents.
{
r9 = or(r8,r6)
r13 = extractu(r13,#11,#52 -32)
r12 = extractu(r12,#11,#52 -32)
p3 = cmp.gt(r28,#-1)
}
.Ldenorm_continue:
// Reciprocal seed + two refinement steps; r12 becomes the tentative
// result exponent (difference of input exponents).
{
r11,p0 = sfrecipa(r8,r9)
r10 = and(r8,#-2)
r28 = #1
r12 = sub(r12,r13)
}
{
r10 -= sfmpy(r11,r9):lib
r1 = insert(r28,#11 +1,#52 -32)
r13 = ##0x00800000 << 3
}
{
r11 += sfmpy(r11,r10):lib
r3 = insert(r28,#11 +1,#52 -32)
r10 = and(r8,#-2)
}
{
r10 -= sfmpy(r11,r9):lib
r5 = #-0x3ff +1
r4 = #0x3ff -1
}
// p1 = exponent within the always-representable range (no ovf/unf).
{
r11 += sfmpy(r11,r10):lib
p1 = cmp.gt(r12,r5)
p1 = !cmp.gt(r12,r4)
}
{
r13 = insert(r11,#23,#3)
r5:4 = #0
r12 = add(r12,#-61)
}
{
r13 = add(r13,#((-3) << 3))
}
// Four quotient-digit passes: multiply remainder by the reciprocal,
// accumulate into r5:4, and subtract digit*divisor from the remainder.
{ r7:6 = mpyu(r13,r1); r1:0 = asl(r1:0,# ( 15 )); }; { r6 = # 0; r1:0 -= mpyu(r7,r2); r15:14 = mpyu(r7,r3); }; { r5:4 += ASL(r7:6, # ( 14 )); r1:0 -= asl(r15:14, # 32); }
{ r7:6 = mpyu(r13,r1); r1:0 = asl(r1:0,# ( 15 )); }; { r6 = # 0; r1:0 -= mpyu(r7,r2); r15:14 = mpyu(r7,r3); }; { r5:4 += ASR(r7:6, # ( 1 )); r1:0 -= asl(r15:14, # 32); }
{ r7:6 = mpyu(r13,r1); r1:0 = asl(r1:0,# ( 15 )); }; { r6 = # 0; r1:0 -= mpyu(r7,r2); r15:14 = mpyu(r7,r3); }; { r5:4 += ASR(r7:6, # ( 16 )); r1:0 -= asl(r15:14, # 32); }
{ r7:6 = mpyu(r13,r1); r1:0 = asl(r1:0,# ( 15 )); }; { r6 = # 0; r1:0 -= mpyu(r7,r2); r15:14 = mpyu(r7,r3); }; { r5:4 += ASR(r7:6, # ( 31 )); r1:0 -= asl(r15:14, # 32); r7:6=# ( 0 ); }
// Final correction: if remainder >= divisor, bump the quotient.
{
r15:14 = sub(r1:0,r3:2)
p0 = cmp.gtu(r3:2,r1:0)
if (!p0.new) r6 = #2
}
{
r5:4 = add(r5:4,r7:6)
if (!p0) r1:0 = r15:14
r15:14 = #0
}
// Nonzero remainder -> set the sticky bit for correct rounding.
{
p0 = cmp.eq(r1:0,r15:14)
if (!p0.new) r4 = or(r4,r28)
}
// Apply the result sign, pack, and rescale.
{
r7:6 = neg(r5:4)
}
{
if (!p3) r5:4 = r7:6
}
{
r1:0 = convert_d2df(r5:4)
if (!p1) jump .Ldiv_ovf_unf
}
{
r1 += asl(r12,#52 -32)
jumpr r31
}
// Exponent out of range: recompute scaling, then overflow to ±inf/±max
// or denormalize with sticky/inexact handling.
.Ldiv_ovf_unf:
{
r1 += asl(r12,#52 -32)
r13 = extractu(r1,#11,#52 -32)
}
{
r7:6 = abs(r5:4)
r12 = add(r12,r13)
}
{
p0 = cmp.gt(r12,##0x3ff +0x3ff)
if (p0.new) jump:nt .Ldiv_ovf
}
{
p0 = cmp.gt(r12,#0)
if (p0.new) jump:nt .Lpossible_unf2
}
// Underflow: shift the quotient down into a denormal, preserving a
// sticky bit; 0x030 presumably sets underflow|inexact in USR (confirm).
{
r13 = add(clb(r7:6),#-1)
r12 = sub(#7,r12)
r10 = USR
r11 = #63
}
{
r13 = min(r12,r11)
r11 = or(r10,#0x030)
r7:6 = asl(r7:6,r13)
r12 = #0
}
{
r15:14 = extractu(r7:6,r13:12)
r7:6 = lsr(r7:6,r13)
r3:2 = #1
}
{
p0 = cmp.gtu(r3:2,r15:14)
if (!p0.new) r6 = or(r2,r6)
r7 = setbit(r7,#52 -32+4)
}
{
r5:4 = neg(r7:6)
p0 = bitsclr(r6,#(1<<4)-1)
if (!p0.new) r10 = r11
}
{
USR = r10
if (p3) r5:4 = r7:6
r10 = #-0x3ff -(52 +4)
}
{
r1:0 = convert_d2df(r5:4)
}
{
r1 += asl(r10,#52 -32)
jumpr r31
}
// Result is right at the denormal boundary: raise underflow|inexact
// only if the value is the smallest normal with nonzero round bits.
.Lpossible_unf2:
{
r3:2 = extractu(r1:0,#63,#0)
r15:14 = combine(##0x00100000,#0)
r10 = #0x7FFF
}
{
p0 = dfcmp.eq(r15:14,r3:2)
p0 = bitsset(r7,r10)
}
{
if (!p0) jumpr r31
r10 = USR
}
{
r10 = or(r10,#0x30)
}
{
USR = r10
}
{
p0 = dfcmp.eq(r1:0,r1:0)
jumpr r31
}
// Overflow: set USR flags 0x28 and pick ±max-finite or ±inf by
// rounding mode (USR bits 23:22) and result sign.
.Ldiv_ovf:
{
r10 = USR
r3:2 = combine(##0x7fefffff,#-1)
r1 = mux(p3,#0,#-1)
}
{
r7:6 = combine(##0x7ff00000,#0)
r5 = extractu(r10,#2,#22)
r10 = or(r10,#0x28)
}
{
USR = r10
r5 ^= lsr(r1,#31)
r4 = r5
}
{
p0 = !cmp.eq(r4,#1)
p0 = !cmp.eq(r5,#2)
if (p0.new) r3:2 = r7:6
p0 = dfcmp.eq(r3:2,r3:2)
}
{
r1:0 = insert(r3:2,#63,#0)
jumpr r31
}
// Slow path: NaN, inf, zero, or denormal operand(s). Classify and
// dispatch; denormals get normalized and rejoin .Ldenorm_continue.
.Ldiv_abnormal:
{
p0 = dfclass(r1:0,#0x0F)
p0 = dfclass(r3:2,#0x0F)
p3 = cmp.gt(r28,#-1)
}
{
p1 = dfclass(r1:0,#0x08)
p1 = dfclass(r3:2,#0x08)
}
{
p2 = dfclass(r1:0,#0x01)
p2 = dfclass(r3:2,#0x01)
}
// NaN anywhere -> .Ldiv_nan; inf/inf or 0/0 -> invalid.
{
if (!p0) jump .Ldiv_nan
if (p1) jump .Ldiv_invalid
}
{
if (p2) jump .Ldiv_invalid
}
// 0/x or x/inf -> signed zero; inf/x or x/0 -> signed inf.
{
p2 = dfclass(r1:0,#(0x0F ^ 0x01))
p2 = dfclass(r3:2,#(0x0F ^ 0x08))
}
{
p1 = dfclass(r1:0,#(0x0F ^ 0x08))
p1 = dfclass(r3:2,#(0x0F ^ 0x01))
}
{
if (!p2) jump .Ldiv_zero_result
if (!p1) jump .Ldiv_inf_result
}
// Normalize denormal operand(s) and recompute effective exponents.
{
p0 = dfclass(r1:0,#0x02)
p1 = dfclass(r3:2,#0x02)
r10 = ##0x00100000
}
{
r13:12 = combine(r3,r1)
r1 = insert(r10,#11 +1,#52 -32)
r3 = insert(r10,#11 +1,#52 -32)
}
{
if (p0) r1 = or(r1,r10)
if (p1) r3 = or(r3,r10)
}
{
r5 = add(clb(r1:0),#-11)
r4 = add(clb(r3:2),#-11)
r10 = #1
}
{
r12 = extractu(r12,#11,#52 -32)
r13 = extractu(r13,#11,#52 -32)
}
{
r1:0 = asl(r1:0,r5)
r3:2 = asl(r3:2,r4)
if (!p0) r12 = sub(r10,r5)
if (!p1) r13 = sub(r10,r4)
}
{
r7:6 = extractu(r3:2,#23,#52 -23)
}
{
r9 = or(r8,r6)
jump .Ldenorm_continue
}
// Zero result with the correct sign (xor of operand signs).
.Ldiv_zero_result:
{
r1 = xor(r1,r3)
r3:2 = #0
}
{
r1:0 = insert(r3:2,#63,#0)
jumpr r31
}
// Infinite result; x/0 with finite nonzero x also raises the
// divide-by-zero flag (USR |= 0x04 -- presumably; confirm).
.Ldiv_inf_result:
{
p2 = dfclass(r3:2,#0x01)
p2 = dfclass(r1:0,#(0x0F ^ 0x08))
}
{
r10 = USR
if (!p2) jump 1f
r1 = xor(r1,r3)
}
{
r10 = or(r10,#0x04)
}
{
USR = r10
}
1:
{
r3:2 = combine(##0x7ff00000,#0)
p0 = dfcmp.uo(r3:2,r3:2)
}
{
r1:0 = insert(r3:2,#63,#0)
jumpr r31
}
// NaN operand: signal via convert_df2sf (presumably raises invalid for
// signaling NaNs -- confirm) and return the all-ones NaN pattern.
.Ldiv_nan:
{
p0 = dfclass(r1:0,#0x10)
p1 = dfclass(r3:2,#0x10)
if (!p0.new) r1:0 = r3:2
if (!p1.new) r3:2 = r1:0
}
{
r5 = convert_df2sf(r1:0)
r4 = convert_df2sf(r3:2)
}
{
r1:0 = #-1
jumpr r31
}
// inf/inf or 0/0: produce NaN via converting a single-float sNaN-ish
// pattern (0x7f800001).
.Ldiv_invalid:
{
r10 = ##0x7f800001
}
{
r1:0 = convert_sf2df(r10)
jumpr r31
}
.size __hexagon_divdf3,.-__hexagon_divdf3

View file

@ -0,0 +1,534 @@
// Double-precision fused multiply-add for Hexagon:
// fma(a, b, c) = a*b + c with a single rounding.
// __hexagon_fmadf4, __hexagon_fmadf5, and the local `fma` label are all
// the same entry; __qdsp_fmadf5 is a `.set` alias.
// The full product is built from four 32x32 mpyu partial products
// (r12*r14, r14*r13, r12*r15, r13*r15) before the addend is folded in.
// NOTE(review): ABI inferred -- a in r1:0, b in r3:2, c in r5:4,
// result in r1:0; r17:16/r19:18 are spilled to a 32-byte frame.
.text
.global __hexagon_fmadf4
.type __hexagon_fmadf4,@function
.global __hexagon_fmadf5
.type __hexagon_fmadf5,@function
.global __qdsp_fmadf5 ; .set __qdsp_fmadf5, __hexagon_fmadf5
.p2align 5
__hexagon_fmadf4:
__hexagon_fmadf5:
fma:
// Fast path needs a and b both normal (dfclass #2).
{
p0 = dfclass(r1:0,#2)
p0 = dfclass(r3:2,#2)
r13:12 = #0
r15:14 = #0
}
// Build the two mantissas (insert sets the hidden-one region; r7 is
// the explicit leading-one bit) and reserve the stack frame.
{
r13:12 = insert(r1:0,#52,#11 -3)
r15:14 = insert(r3:2,#52,#11 -3)
r7 = ##0x10000000
allocframe(#32)
}
// First partial product (low x low) starts while abnormal a/b bail out.
{
r9:8 = mpyu(r12,r14)
if (!p0) jump .Lfma_abnormal_ab
r13 = or(r13,r7)
r15 = or(r15,r7)
}
// Abnormal addend c gets fixed up separately, then restarts here.
{
p0 = dfclass(r5:4,#2)
if (!p0.new) jump:nt .Lfma_abnormal_c
r11:10 = combine(r7,#0)
r7:6 = combine(#0,r9)
}
.Lfma_abnormal_c_restart:
// Accumulate the cross products; build c's mantissa in r11:10;
// spill callee-saved r17:16/r19:18.
{
r7:6 += mpyu(r14,r13)
r11:10 = insert(r5:4,#52,#11 -3)
memd(r29+#0) = r17:16
memd(r29+#8) = r19:18
}
{
r7:6 += mpyu(r12,r15)
r19:18 = neg(r11:10)
p0 = cmp.gt(r5,#-1)
r28 = xor(r1,r3)
}
// r18/r19 = exponents; product exponent = expo(a) + expo(b).
{
r18 = extractu(r1,#11,#20)
r19 = extractu(r3,#11,#20)
r17:16 = combine(#0,r7)
if (!p0) r11:10 = r19:18
}
{
r17:16 += mpyu(r13,r15)
r9:8 = combine(r6,r8)
r18 = add(r18,r19)
r19 = extractu(r5,#11,#20)
}
{
r18 = add(r18,#-1023 +(4))
p3 = !cmp.gt(r28,#-1)
r7:6 = #0
r15:14 = #0
}
// Negate the 128-bit product if the product sign (p3) is negative;
// order product vs. addend by exponent.
{
r7:6 = sub(r7:6,r9:8,p3):carry
p0 = !cmp.gt(r28,#-1)
p1 = cmp.gt(r19,r18)
if (p1.new) r19:18 = combine(r18,r19)
}
{
r15:14 = sub(r15:14,r17:16,p3):carry
if (p0) r9:8 = r7:6
r7:6 = #0
r19 = sub(r18,r19)
}
{
if (p0) r17:16 = r15:14
p0 = cmp.gt(r19,#63)
if (p1) r9:8 = r7:6
if (p1) r7:6 = r9:8
}
{
if (p1) r17:16 = r11:10
if (p1) r11:10 = r17:16
if (p0) r19 = add(r19,#-64)
r28 = #63
}
// Align the smaller term right by the exponent difference, capturing
// shifted-out bits (r5:4) for the sticky computation.
{
if (p0) r7:6 = r11:10
r28 = asr(r11,#31)
r13 = min(r19,r28)
r12 = #0
}
{
if (p0) r11:10 = combine(r28,r28)
r5:4 = extract(r7:6,r13:12)
r7:6 = lsr(r7:6,r13)
r12 = sub(#64,r13)
}
{
r15:14 = #0
r28 = #-2
r7:6 |= lsl(r11:10,r12)
r11:10 = asr(r11:10,r13)
}
// Fold sticky into the low bit and add the aligned 128-bit terms.
{
p3 = cmp.gtu(r5:4,r15:14)
if (p3.new) r6 = and(r6,r28)
r15:14 = #1
r5:4 = #0
}
{
r9:8 = add(r7:6,r9:8,p3):carry
}
{
r17:16 = add(r11:10,r17:16,p3):carry
r28 = #62
}
// Normalize: if the leading bit is not already at position 62, shift
// 62 bits up from the low half first.
{
r12 = add(clb(r17:16),#-2)
if (!cmp.eq(r12.new,r28)) jump:t 1f
}
{
r11:10 = extractu(r9:8,#62,#2)
r9:8 = asl(r9:8,#62)
r18 = add(r18,#-62)
}
{
r17:16 = insert(r11:10,#62,#0)
}
{
r12 = add(clb(r17:16),#-2)
}
.falign
1:
{
r11:10 = asl(r17:16,r12)
r5:4 |= asl(r9:8,r12)
r13 = sub(#64,r12)
r18 = sub(r18,r12)
}
// Set sticky from the remaining low bits; check the exponent range.
{
r11:10 |= lsr(r9:8,r13)
p2 = cmp.gtu(r15:14,r5:4)
r28 = #1023 +1023 -2
}
{
if (!p2) r10 = or(r10,r14)
p0 = !cmp.gt(r18,r28)
p0 = cmp.gt(r18,#1)
if (!p0.new) jump:nt .Lfma_ovf_unf
}
// Pack, rescale, restore saved registers, and return.
{
p0 = cmp.gtu(r15:14,r11:10)
r1:0 = convert_d2df(r11:10)
r18 = add(r18,#-1023 -60)
r17:16 = memd(r29+#0)
}
{
r1 += asl(r18,#20)
r19:18 = memd(r29+#8)
if (!p0) dealloc_return
}
// Exact cancellation to zero: sign follows the rounding mode
// (USR bits 23:22, presumably; mode 2 -> -0).
.Ladd_yields_zero:
{
r28 = USR
r1:0 = #0
}
{
r28 = extractu(r28,#2,#22)
r17:16 = memd(r29+#0)
r19:18 = memd(r29+#8)
}
{
p0 = cmp.eq(r28,#2)
if (p0.new) r1 = ##0x80000000
dealloc_return
}
// Exponent out of range: re-derive scaling and handle overflow or
// gradual underflow explicitly.
.Lfma_ovf_unf:
{
p0 = cmp.gtu(r15:14,r11:10)
if (p0.new) jump:nt .Ladd_yields_zero
}
{
r1:0 = convert_d2df(r11:10)
r18 = add(r18,#-1023 -60)
r28 = r18
}
{
r1 += asl(r18,#20)
r7 = extractu(r1,#11,#20)
}
{
r6 = add(r18,r7)
r17:16 = memd(r29+#0)
r19:18 = memd(r29+#8)
r9:8 = abs(r11:10)
}
{
p0 = cmp.gt(r6,##1023 +1023)
if (p0.new) jump:nt .Lfma_ovf
}
{
p0 = cmp.gt(r6,#0)
if (p0.new) jump:nt .Lpossible_unf0
}
// Underflow: shift down to a denormal with sticky bits; 0x0030
// presumably sets underflow|inexact in USR (confirm).
{
r7 = add(clb(r9:8),#-2)
r6 = sub(#1+5,r28)
p3 = cmp.gt(r11,#-1)
}
{
r6 = add(r6,r7)
r9:8 = asl(r9:8,r7)
r1 = USR
r28 = #63
}
{
r7 = min(r6,r28)
r6 = #0
r0 = #0x0030
}
{
r3:2 = extractu(r9:8,r7:6)
r9:8 = asr(r9:8,r7)
}
{
p0 = cmp.gtu(r15:14,r3:2)
if (!p0.new) r8 = or(r8,r14)
r9 = setbit(r9,#20 +3)
}
{
r11:10 = neg(r9:8)
p1 = bitsclr(r8,#(1<<3)-1)
if (!p1.new) r1 = or(r1,r0)
r3:2 = #0
}
{
if (p3) r11:10 = r9:8
USR = r1
r28 = #-1023 -(52 +3)
}
{
r1:0 = convert_d2df(r11:10)
}
{
r1 += asl(r28,#20)
dealloc_return
}
// Result sits at the smallest-normal boundary: raise flags only when
// the round bits were nonzero.
.Lpossible_unf0:
{
r28 = ##0x7fefffff
r9:8 = abs(r11:10)
}
{
p0 = cmp.eq(r0,#0)
p0 = bitsclr(r1,r28)
if (!p0.new) dealloc_return:t
r28 = #0x7fff
}
{
p0 = bitsset(r9,r28)
r3 = USR
r2 = #0x0030
}
{
if (p0) r3 = or(r3,r2)
}
{
USR = r3
}
{
p0 = dfcmp.eq(r1:0,r1:0)
dealloc_return
}
// Overflow: set USR 0x28 and return ±max-finite or ±inf depending on
// rounding mode (USR bits 23:22) and result sign.
.Lfma_ovf:
{
r28 = USR
r11:10 = combine(##0x7fefffff,#-1)
r1:0 = r11:10
}
{
r9:8 = combine(##0x7ff00000,#0)
r3 = extractu(r28,#2,#22)
r28 = or(r28,#0x28)
}
{
USR = r28
r3 ^= lsr(r1,#31)
r2 = r3
}
{
p0 = !cmp.eq(r2,#1)
p0 = !cmp.eq(r3,#2)
}
{
p0 = dfcmp.eq(r9:8,r9:8)
if (p0.new) r11:10 = r9:8
}
{
r1:0 = insert(r11:10,#63,#0)
dealloc_return
}
// a or b is NaN/inf/zero/denormal (frame not yet needed -- drop it).
// Order by magnitude so the larger |value| is in r1:0.
.Lfma_abnormal_ab:
{
r9:8 = extractu(r1:0,#63,#0)
r11:10 = extractu(r3:2,#63,#0)
deallocframe
}
{
p3 = cmp.gtu(r9:8,r11:10)
if (!p3.new) r1:0 = r3:2
if (!p3.new) r3:2 = r1:0
}
{
p0 = dfclass(r1:0,#0x0f)
if (!p0.new) jump:nt .Lnan
if (!p3) r9:8 = r11:10
if (!p3) r11:10 = r9:8
}
// inf * finite-nonzero -> .Lab_inf; inf * 0 -> invalid.
{
p1 = dfclass(r1:0,#0x08)
p1 = dfclass(r3:2,#0x0e)
}
{
p0 = dfclass(r1:0,#0x08)
p0 = dfclass(r3:2,#0x01)
}
{
if (p1) jump .Lab_inf
p2 = dfclass(r3:2,#0x01)
}
{
if (p0) jump .Linvalid
if (p2) jump .Lab_true_zero
r28 = ##0x7c000000
}
// Denormal operand: rescale b up and compensate a's exponent, unless
// a's exponent is already tiny (.Lfma_ab_tiny).
{
p0 = bitsclr(r1,r28)
if (p0.new) jump:nt .Lfma_ab_tiny
}
{
r28 = add(clb(r11:10),#-11)
}
{
r11:10 = asl(r11:10,r28)
}
{
r3:2 = insert(r11:10,#63,#0)
r1 -= asl(r28,#20)
}
jump fma
// Both tiny: clamp both mantissa-normalized values to the smallest
// normal magnitude and retry.
.Lfma_ab_tiny:
r9:8 = combine(##0x00100000,#0)
{
r1:0 = insert(r9:8,#63,#0)
r3:2 = insert(r9:8,#63,#0)
}
jump fma
// a*b is ±inf (sign from both operands); NaN c still wins, and
// inf + (-inf) is invalid.
.Lab_inf:
{
r3:2 = lsr(r3:2,#63)
p0 = dfclass(r5:4,#0x10)
}
{
r1:0 ^= asl(r3:2,#63)
if (p0) jump .Lnan
}
{
p1 = dfclass(r5:4,#0x08)
if (p1.new) jump:nt .Lfma_inf_plus_inf
}
{
jumpr r31
}
.falign
.Lfma_inf_plus_inf:
{
p0 = dfcmp.eq(r1:0,r5:4)
if (!p0.new) jump:nt .Linvalid
}
{
jumpr r31
}
// NaN path: the convert_df2sf ops presumably signal invalid for
// signaling NaNs (confirm); return the all-ones NaN pattern.
.Lnan:
{
p0 = dfclass(r3:2,#0x10)
p1 = dfclass(r5:4,#0x10)
if (!p0.new) r3:2 = r1:0
if (!p1.new) r5:4 = r1:0
}
{
r3 = convert_df2sf(r3:2)
r2 = convert_df2sf(r5:4)
}
{
r3 = convert_df2sf(r1:0)
r1:0 = #-1
jumpr r31
}
// Invalid operation: produce NaN from the 0x7f800001 single pattern.
.Linvalid:
{
r28 = ##0x7f800001
}
{
r1:0 = convert_sf2df(r28)
jumpr r31
}
// a*b is a true zero: result is c, except 0 + 0 needs the signed-zero
// rounding rule (mirrors .Ladd_yields_zero).
.Lab_true_zero:
{
p0 = dfclass(r5:4,#0x10)
if (p0.new) jump:nt .Lnan
if (p0.new) r1:0 = r5:4
}
{
p0 = dfcmp.eq(r3:2,r5:4)
r1 = lsr(r1,#31)
}
{
r3 ^= asl(r1,#31)
if (!p0) r1:0 = r5:4
if (!p0) jumpr r31
}
{
p0 = cmp.eq(r3:2,r5:4)
if (p0.new) jumpr:t r31
r1:0 = r3:2
}
{
r28 = USR
}
{
r28 = extractu(r28,#2,#22)
r1:0 = #0
}
{
p0 = cmp.eq(r28,#2)
if (p0.new) r1 = ##0x80000000
jumpr r31
}
.falign
// Abnormal addend c: NaN c -> NaN; inf c -> c; zero c -> a*b via
// __hexagon_muldf3 tail-call; denormal c -> give it exponent 1 and
// restart the accumulation.
.Lfma_abnormal_c:
{
p0 = dfclass(r5:4,#0x10)
if (p0.new) jump:nt .Lnan
if (p0.new) r1:0 = r5:4
deallocframe
}
{
p0 = dfclass(r5:4,#0x08)
if (p0.new) r1:0 = r5:4
if (p0.new) jumpr:nt r31
}
{
p0 = dfclass(r5:4,#0x01)
if (p0.new) jump:nt __hexagon_muldf3
r28 = #1
}
{
allocframe(#32)
r11:10 = #0
r5 = insert(r28,#11,#20)
jump .Lfma_abnormal_c_restart
}
.size fma,.-fma

View file

@ -0,0 +1,45 @@
// Double-precision min/max for Hexagon: __hexagon_mindf3 and
// __hexagon_maxdf3 (with __qdsp_* `.set` aliases).
// NOTE(review): ABI inferred -- operands in r1:0 and r3:2, result in
// r1:0, return via `jumpr r31`.
.text
.global __hexagon_mindf3
.global __hexagon_maxdf3
.type __hexagon_mindf3,@function
.type __hexagon_maxdf3,@function
.global __qdsp_mindf3 ; .set __qdsp_mindf3, __hexagon_mindf3
.global __qdsp_maxdf3 ; .set __qdsp_maxdf3, __hexagon_maxdf3
.p2align 5
// min(a, b): take b when a is NaN (dfclass #0x10) or a > b; when the
// two compare equal, OR the bit patterns so min(+0, -0) keeps the sign
// bit set, i.e. returns -0.
__hexagon_mindf3:
{
p0 = dfclass(r1:0,#0x10)
p1 = dfcmp.gt(r1:0,r3:2)
r5:4 = r1:0
}
{
if (p0) r1:0 = r3:2
if (p1) r1:0 = r3:2
p2 = dfcmp.eq(r1:0,r3:2)
if (!p2.new) jumpr:t r31
}
{
r1:0 = or(r5:4,r3:2)
jumpr r31
}
.size __hexagon_mindf3,.-__hexagon_mindf3
.falign
// max(a, b): take b when a is NaN or b > a; when equal, AND the bit
// patterns so max(+0, -0) clears the sign bit, i.e. returns +0.
__hexagon_maxdf3:
{
p0 = dfclass(r1:0,#0x10)
p1 = dfcmp.gt(r3:2,r1:0)
r5:4 = r1:0
}
{
if (p0) r1:0 = r3:2
if (p1) r1:0 = r3:2
p2 = dfcmp.eq(r1:0,r3:2)
if (!p2.new) jumpr:t r31
}
{
r1:0 = and(r5:4,r3:2)
jumpr r31
}
.size __hexagon_maxdf3,.-__hexagon_maxdf3

Some files were not shown because too many files have changed in this diff Show more