From 48504415cff11ac421bce701be4c0188afed77ef Mon Sep 17 00:00:00 2001
From: Jubilee Young <workingjubilee@gmail.com>
Date: Tue, 8 Oct 2024 17:26:42 -0700
Subject: [PATCH 1/2] cg_clif: Factor out rustc_target::abi

---
 src/discriminant.rs | 3 ++-
 src/lib.rs          | 1 +
 2 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/src/discriminant.rs b/src/discriminant.rs
index e7ac084558a5..d462dcd63a92 100644
--- a/src/discriminant.rs
+++ b/src/discriminant.rs
@@ -3,7 +3,8 @@
 //! Adapted from <https://github.com/rust-lang/rust/blob/31c0645b9d2539f47eecb096142474b29dc542f7/compiler/rustc_codegen_ssa/src/mir/place.rs>
 //! (<https://github.com/rust-lang/rust/pull/104535>)
 
-use rustc_target::abi::{Int, TagEncoding, Variants};
+use rustc_abi::Primitive::Int;
+use rustc_abi::{TagEncoding, Variants};
 
 use crate::prelude::*;
 
diff --git a/src/lib.rs b/src/lib.rs
index f6b7981395a5..b6f9ce8fc298 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -15,6 +15,7 @@
 extern crate jobserver;
 #[macro_use]
 extern crate rustc_middle;
+extern crate rustc_abi;
 extern crate rustc_ast;
 extern crate rustc_codegen_ssa;
 extern crate rustc_data_structures;

From 270fb2130c979fe1d1e520dbb8474aa13257d16b Mon Sep 17 00:00:00 2001
From: Jed Brown <jed@jedbrown.org>
Date: Fri, 5 Jan 2024 21:04:41 -0700
Subject: [PATCH 2/2] intrinsics.fmuladdf{16,32,64,128}: expose llvm.fmuladd.*
 semantics

Add intrinsics `fmuladd{f16,f32,f64,f128}`. These compute `(a * b) +
c`, fused into a single operation if the code generator determines
that (i) the target instruction set supports a fused operation, and
(ii) the fused operation is more efficient than the equivalent,
separate pair of `mul` and `add` instructions.

https://llvm.org/docs/LangRef.html#llvm-fmuladd-intrinsic

MIRI support is included for f32 and f64.

codegen_cranelift uses the `fma` function from libc, which is a
correct implementation but lacks the desired performance semantics. I
think this requires an update to Cranelift to expose a suitable
instruction in its IR.

I have not tested with codegen_gcc, but it should behave the same
way (using `fma` from libc).
---
 src/intrinsics/mod.rs | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/src/intrinsics/mod.rs b/src/intrinsics/mod.rs
index 19e5adc25385..35f0ccff3f99 100644
--- a/src/intrinsics/mod.rs
+++ b/src/intrinsics/mod.rs
@@ -328,6 +328,9 @@ fn codegen_float_intrinsic_call<'tcx>(
         sym::fabsf64 => ("fabs", 1, fx.tcx.types.f64, types::F64),
         sym::fmaf32 => ("fmaf", 3, fx.tcx.types.f32, types::F32),
         sym::fmaf64 => ("fma", 3, fx.tcx.types.f64, types::F64),
+        // FIXME: calling `fma` from libc without FMA target feature uses expensive software emulation
+        sym::fmuladdf32 => ("fmaf", 3, fx.tcx.types.f32, types::F32), // TODO: use cranelift intrinsic analogous to llvm.fmuladd.f32
+        sym::fmuladdf64 => ("fma", 3, fx.tcx.types.f64, types::F64), // TODO: use cranelift intrinsic analogous to llvm.fmuladd.f64
         sym::copysignf32 => ("copysignf", 2, fx.tcx.types.f32, types::F32),
         sym::copysignf64 => ("copysign", 2, fx.tcx.types.f64, types::F64),
         sym::floorf32 => ("floorf", 1, fx.tcx.types.f32, types::F32),
@@ -381,7 +384,7 @@ fn codegen_float_intrinsic_call<'tcx>(
 
     let layout = fx.layout_of(ty);
     let res = match intrinsic {
-        sym::fmaf32 | sym::fmaf64 => {
+        sym::fmaf32 | sym::fmaf64 | sym::fmuladdf32 | sym::fmuladdf64 => {
             CValue::by_val(fx.bcx.ins().fma(args[0], args[1], args[2]), layout)
         }
         sym::copysignf32 | sym::copysignf64 => {