Remove the need to call clang for std::offload usages

2025-12-09 07:50:30 -08:00 · 2025-12-09 07:50:30 -08:00 · dfef2e96fe
commit dfef2e96fe
parent 0ac9e59d8f
8 changed files with 149 additions and 30 deletions
--- a/compiler/rustc_codegen_llvm/src/back/write.rs
+++ b/compiler/rustc_codegen_llvm/src/back/write.rs
@ -703,10 +703,9 @@ pub(crate) unsafe fn llvm_optimize(
        llvm::set_value_name(new_fn, &name);
    }

-    if cgcx.target_is_like_gpu && config.offload.contains(&config::Offload::Enable) {
+    if cgcx.target_is_like_gpu && config.offload.contains(&config::Offload::Device) {
        let cx =
            SimpleCx::new(module.module_llvm.llmod(), module.module_llvm.llcx, cgcx.pointer_size);
-
        for func in cx.get_functions() {
            let offload_kernel = "offload-kernel";
            if attributes::has_string_attr(func, offload_kernel) {
@ -775,12 +774,79 @@ pub(crate) unsafe fn llvm_optimize(
        )
    };

-    if cgcx.target_is_like_gpu && config.offload.contains(&config::Offload::Enable) {
+    if cgcx.target_is_like_gpu && config.offload.contains(&config::Offload::Device) {
+        let device_path = cgcx.output_filenames.path(OutputType::Object);
+        let device_dir = device_path.parent().unwrap();
+        let device_out = device_dir.join("host.out");
+        let device_out_c = path_to_c_string(device_out.as_path());
        unsafe {
-            llvm::LLVMRustBundleImages(module.module_llvm.llmod(), module.module_llvm.tm.raw());
+            // 1) Bundle device module into offload image host.out (device TM)
+            let ok = llvm::LLVMRustBundleImages(
+                module.module_llvm.llmod(),
+                module.module_llvm.tm.raw(),
+                device_out_c.as_ptr(),
+            );
+            assert!(ok, "LLVMRustBundleImages (device -> host.out) failed");
+            if !device_out.exists() {
+                panic!("BundleImages failed, `host.out` was not created!");
+            }
        }
    }

+    // This assumes that we previously compiled our kernels for a gpu target, which created a
+    // `host.out` artifact. The user is supposed to provide us with a path to this artifact; we
+    // don't need any other artifacts from the previous run. We will embed this artifact into our
+    // LLVM-IR host module, to create a `host.o` ObjectFile, which we will write to disk.
+    // The last, not yet automated step uses the `clang-linker-wrapper` to process `host.o`.
+    if !cgcx.target_is_like_gpu {
+        if let Some(device_path) = config
+            .offload
+            .iter()
+            .find_map(|o| if let config::Offload::Host(path) = o { Some(path) } else { None })
+        {
+            let device_pathbuf = PathBuf::from(device_path);
+            if device_pathbuf.is_relative() {
+                panic!("Absolute path is needed");
+            } else if device_pathbuf
+                .file_name()
+                .and_then(|n| n.to_str())
+                .is_some_and(|n| n != "host.out")
+            {
+                panic!("Need path to the host.out file");
+            }
+            assert!(device_pathbuf.exists());
+            let host_path = cgcx.output_filenames.path(OutputType::Object);
+            let host_dir = host_path.parent().unwrap();
+            let out_obj = host_dir.join("host.o");
+            let host_out_c = path_to_c_string(device_pathbuf.as_path());
+
+            // 2) Finalize host: lib.bc + host.out -> host.o (host TM)
+            // We create a full clone of our LLVM host module, since we will embed the device IR
+            // into it, and this might break caching or incremental compilation otherwise.
+            let llmod2 = llvm::LLVMCloneModule(module.module_llvm.llmod());
+            let ok =
+                unsafe { llvm::LLVMRustOffloadEmbedBufferInModule(llmod2, host_out_c.as_ptr()) };
+            assert!(ok, "LLVMRustOffloadEmbedBufferInModule failed");
+            write_output_file(
+                dcx,
+                module.module_llvm.tm.raw(),
+                config.no_builtins,
+                llmod2,
+                &out_obj,
+                None,
+                llvm::FileType::ObjectFile,
+                &cgcx.prof,
+                true,
+            );
+            if !out_obj.exists() {
+                dbg!("{:?} does not exist!", out_obj);
+                panic!("FinalizeOffload failed!");
+            }
+            // We ignore cgcx.save_temps here and unconditionally keep our `host.out` artifact.
+            // Otherwise, recompiling the host code would fail since we deleted that device artifact
+            // in the previous host compilation, which would be confusing at best.
+        }
+    }
    result.into_result().unwrap_or_else(|()| llvm_err(dcx, LlvmError::RunLlvmPasses))
 }

--- a/compiler/rustc_codegen_llvm/src/base.rs
+++ b/compiler/rustc_codegen_llvm/src/base.rs
@ -93,9 +93,9 @@ pub(crate) fn compile_codegen_unit(
            // They are necessary for correct offload execution. We do this here to simplify the
            // `offload` intrinsic, avoiding the need for tracking whether it's the first
            // intrinsic call or not.
-            if cx.sess().opts.unstable_opts.offload.contains(&Offload::Enable)
-                && !cx.sess().target.is_like_gpu
-            {
+            let has_host_offload =
+                cx.sess().opts.unstable_opts.offload.iter().any(|o| matches!(o, Offload::Host(_)));
+            if has_host_offload && !cx.sess().target.is_like_gpu {
                cx.offload_globals.replace(Some(OffloadGlobals::declare(&cx)));
            }

--- a/compiler/rustc_codegen_llvm/src/intrinsic.rs
+++ b/compiler/rustc_codegen_llvm/src/intrinsic.rs
@ -202,13 +202,7 @@ impl<'ll, 'tcx> IntrinsicCallBuilderMethods<'tcx> for Builder<'_, 'll, 'tcx> {
                return Ok(());
            }
            sym::offload => {
-                if !tcx
-                    .sess
-                    .opts
-                    .unstable_opts
-                    .offload
-                    .contains(&rustc_session::config::Offload::Enable)
-                {
+                if tcx.sess.opts.unstable_opts.offload.is_empty() {
                    let _ = tcx.dcx().emit_almost_fatal(OffloadWithoutEnable);
                }

--- a/compiler/rustc_codegen_llvm/src/llvm/ffi.rs
+++ b/compiler/rustc_codegen_llvm/src/llvm/ffi.rs
@ -1666,7 +1666,15 @@ mod Offload {
    use super::*;
    unsafe extern "C" {
        /// Processes the module and writes it in an offload compatible way into a "host.out" file.
-        pub(crate) fn LLVMRustBundleImages<'a>(M: &'a Module, TM: &'a TargetMachine) -> bool;
+        pub(crate) fn LLVMRustBundleImages<'a>(
+            M: &'a Module,
+            TM: &'a TargetMachine,
+            host_out: *const c_char,
+        ) -> bool;
+        pub(crate) unsafe fn LLVMRustOffloadEmbedBufferInModule<'a>(
+            _M: &'a Module,
+            _host_out: *const c_char,
+        ) -> bool;
        pub(crate) fn LLVMRustOffloadMapper<'a>(OldFn: &'a Value, NewFn: &'a Value);
    }
 }
@ -1680,7 +1688,17 @@ mod Offload_fallback {
    /// Processes the module and writes it in an offload compatible way into a "host.out" file.
    /// Marked as unsafe to match the real offload wrapper which is unsafe due to FFI.
    #[allow(unused_unsafe)]
-    pub(crate) unsafe fn LLVMRustBundleImages<'a>(_M: &'a Module, _TM: &'a TargetMachine) -> bool {
+    pub(crate) unsafe fn LLVMRustBundleImages<'a>(
+        _M: &'a Module,
+        _TM: &'a TargetMachine,
+        _host_out: *const c_char,
+    ) -> bool {
+        unimplemented!("This rustc version was not built with LLVM Offload support!");
+    }
+    pub(crate) unsafe fn LLVMRustOffloadEmbedBufferInModule<'a>(
+        _M: &'a Module,
+        _host_out: *const c_char,
+    ) -> bool {
        unimplemented!("This rustc version was not built with LLVM Offload support!");
    }
    #[allow(unused_unsafe)]
--- a/compiler/rustc_interface/src/tests.rs
+++ b/compiler/rustc_interface/src/tests.rs
@ -837,7 +837,7 @@ fn test_unstable_options_tracking_hash() {
    tracked!(no_profiler_runtime, true);
    tracked!(no_trait_vptr, true);
    tracked!(no_unique_section_names, true);
-    tracked!(offload, vec![Offload::Enable]);
+    tracked!(offload, vec![Offload::Device]);
    tracked!(on_broken_pipe, OnBrokenPipe::Kill);
    tracked!(osx_rpath_install_name, true);
    tracked!(packed_bundled_libs, true);
--- a/compiler/rustc_llvm/llvm-wrapper/RustWrapper.cpp
+++ b/compiler/rustc_llvm/llvm-wrapper/RustWrapper.cpp
@ -43,8 +43,10 @@
 // available. As such, we only try to build it in the first place, if
 // llvm.offload is enabled.
 #ifdef OFFLOAD
+#include "llvm/Bitcode/BitcodeReader.h"
 #include "llvm/Object/OffloadBinary.h"
 #include "llvm/Target/TargetMachine.h"
+#include "llvm/Transforms/Utils/ModuleUtils.h"
 #endif

 // for raw `write` in the bad-alloc handler
@ -174,12 +176,13 @@ static Error writeFile(StringRef Filename, StringRef Data) {
 //  --image=file=device.bc,triple=amdgcn-amd-amdhsa,arch=gfx90a,kind=openmp
 // The input module is the rust code compiled for a gpu target like amdgpu.
 // Based on clang/tools/clang-offload-packager/ClangOffloadPackager.cpp
-extern "C" bool LLVMRustBundleImages(LLVMModuleRef M, TargetMachine &TM) {
+extern "C" bool LLVMRustBundleImages(LLVMModuleRef M, TargetMachine &TM,
+                                     const char *HostOutPath) {
  std::string Storage;
  llvm::raw_string_ostream OS1(Storage);
  llvm::WriteBitcodeToFile(*unwrap(M), OS1);
  OS1.flush();
-  auto MB = llvm::MemoryBuffer::getMemBufferCopy(Storage, "module.bc");
+  auto MB = llvm::MemoryBuffer::getMemBufferCopy(Storage, "device.bc");

  SmallVector<char, 1024> BinaryData;
  raw_svector_ostream OS2(BinaryData);
@ -188,19 +191,38 @@ extern "C" bool LLVMRustBundleImages(LLVMModuleRef M, TargetMachine &TM) {
  ImageBinary.TheImageKind = object::IMG_Bitcode;
  ImageBinary.Image = std::move(MB);
  ImageBinary.TheOffloadKind = object::OFK_OpenMP;
-  ImageBinary.StringData["triple"] = TM.getTargetTriple().str();
-  ImageBinary.StringData["arch"] = TM.getTargetCPU();
+
+  std::string TripleStr = TM.getTargetTriple().str();
+  llvm::StringRef CPURef = TM.getTargetCPU();
+  ImageBinary.StringData["triple"] = TripleStr;
+  ImageBinary.StringData["arch"] = CPURef;
  llvm::SmallString<0> Buffer = OffloadBinary::write(ImageBinary);
  if (Buffer.size() % OffloadBinary::getAlignment() != 0)
    // Offload binary has invalid size alignment
    return false;
  OS2 << Buffer;
-  if (Error E = writeFile("host.out",
+  if (Error E = writeFile(HostOutPath,
                          StringRef(BinaryData.begin(), BinaryData.size())))
    return false;
  return true;
 }

+// Reads the file at `HostOutPath` and embeds its contents into `HostM` under
+// the `.llvm.offloading` section. Returns false if the file cannot be read.
+extern "C" bool LLVMRustOffloadEmbedBufferInModule(LLVMModuleRef HostM,
+                                                   const char *HostOutPath) {
+  auto MBOrErr = MemoryBuffer::getFile(HostOutPath);
+  // Do not wrap the error_code in an llvm::Error here: an unchecked failure
+  // Error aborts on destruction when LLVM is built with ABI-breaking checks.
+  if (!MBOrErr)
+    return false;
+  MemoryBufferRef Buf = (*MBOrErr)->getMemBufferRef();
+  Module *M = unwrap(HostM);
+  // Embed with 8-byte alignment under the `.llvm.offloading` section name.
+  llvm::embedBufferInModule(*M, Buf, ".llvm.offloading", Align(8));
+  return true;
+}
+
 extern "C" void LLVMRustOffloadMapper(LLVMValueRef OldFn, LLVMValueRef NewFn) {
  llvm::Function *oldFn = llvm::unwrap<llvm::Function>(OldFn);
  llvm::Function *newFn = llvm::unwrap<llvm::Function>(NewFn);
--- a/compiler/rustc_session/src/config.rs
+++ b/compiler/rustc_session/src/config.rs
@ -190,10 +190,12 @@ pub enum CoverageLevel {
 }

 // The different settings that the `-Z offload` flag can have.
-#[derive(Clone, Copy, PartialEq, Hash, Debug)]
+#[derive(Clone, PartialEq, Hash, Debug)]
 pub enum Offload {
-    /// Enable the llvm offload pipeline
-    Enable,
+    /// Entry point for `std::offload`, enables kernel compilation for a gpu device
+    Device,
+    /// Second step in the offload pipeline, generates the host code to call kernels.
+    Host(String),
 }

 /// The different settings that the `-Z autodiff` flag can have.
@ -2578,9 +2580,7 @@ pub fn build_session_options(early_dcx: &mut EarlyDiagCtxt, matches: &getopts::M
        )
    }

-    if !nightly_options::is_unstable_enabled(matches)
-        && unstable_opts.offload.contains(&Offload::Enable)
-    {
+    if !nightly_options::is_unstable_enabled(matches) && !unstable_opts.offload.is_empty() {
        early_dcx.early_fatal(
            "`-Zoffload=Enable` also requires `-Zunstable-options` \
                and a nightly compiler",
--- a/compiler/rustc_session/src/options.rs
+++ b/compiler/rustc_session/src/options.rs
@ -1451,8 +1451,27 @@ pub mod parse {
        let mut v: Vec<&str> = v.split(",").collect();
        v.sort_unstable();
        for &val in v.iter() {
-            let variant = match val {
-                "Enable" => Offload::Enable,
+            // Split each entry on '=' if it has an argument
+            let (key, arg) = match val.split_once('=') {
+                Some((k, a)) => (k, Some(a)),
+                None => (val, None),
+            };
+
+            let variant = match key {
+                "Host" => {
+                    if let Some(p) = arg {
+                        Offload::Host(p.to_string())
+                    } else {
+                        return false;
+                    }
+                }
+                "Device" => {
+                    if let Some(_) = arg {
+                        // Device does not accept a value
+                        return false;
+                    }
+                    Offload::Device
+                }
                _ => {
                    // FIXME(ZuseZ4): print an error saying which value is not recognized
                    return false;