diff --git a/compiler/rustc_codegen_llvm/src/back/write.rs b/compiler/rustc_codegen_llvm/src/back/write.rs index e8da7f68136d..a649ee4bff1c 100644 --- a/compiler/rustc_codegen_llvm/src/back/write.rs +++ b/compiler/rustc_codegen_llvm/src/back/write.rs @@ -703,10 +703,9 @@ pub(crate) unsafe fn llvm_optimize( llvm::set_value_name(new_fn, &name); } - if cgcx.target_is_like_gpu && config.offload.contains(&config::Offload::Enable) { + if cgcx.target_is_like_gpu && config.offload.contains(&config::Offload::Device) { let cx = SimpleCx::new(module.module_llvm.llmod(), module.module_llvm.llcx, cgcx.pointer_size); - for func in cx.get_functions() { let offload_kernel = "offload-kernel"; if attributes::has_string_attr(func, offload_kernel) { @@ -775,12 +774,79 @@ pub(crate) unsafe fn llvm_optimize( ) }; - if cgcx.target_is_like_gpu && config.offload.contains(&config::Offload::Enable) { + if cgcx.target_is_like_gpu && config.offload.contains(&config::Offload::Device) { + let device_path = cgcx.output_filenames.path(OutputType::Object); + let device_dir = device_path.parent().unwrap(); + let device_out = device_dir.join("host.out"); + let device_out_c = path_to_c_string(device_out.as_path()); unsafe { - llvm::LLVMRustBundleImages(module.module_llvm.llmod(), module.module_llvm.tm.raw()); + // 1) Bundle device module into offload image host.out (device TM) + let ok = llvm::LLVMRustBundleImages( + module.module_llvm.llmod(), + module.module_llvm.tm.raw(), + device_out_c.as_ptr(), + ); + assert!(ok, "LLVMRustBundleImages (device -> host.out) failed"); + if !device_out.exists() { + panic!("BundleImages failed, `host.out` was not created!"); + } } } + // This assumes that we previously compiled our kernels for a gpu target, which created a + // `host.out` artifact. The user is supposed to provide us with a path to this artifact, we + // don't need any other artifacts from the previous run. We will embed this artifact into our + // LLVM-IR host module, to create a `host.o` ObjectFile, which we will write to disk. + // The last, not yet automated steps uses the `clang-linker-wrapper` to process `host.o`. + if !cgcx.target_is_like_gpu { + if let Some(device_path) = config + .offload + .iter() + .find_map(|o| if let config::Offload::Host(path) = o { Some(path) } else { None }) + { + let device_pathbuf = PathBuf::from(device_path); + if device_pathbuf.is_relative() { + panic!("Absolute path is needed"); + } else if device_pathbuf + .file_name() + .and_then(|n| n.to_str()) + .is_some_and(|n| n != "host.out") + { + panic!("Need path to the host.out file"); + } + assert!(device_pathbuf.exists()); + let host_path = cgcx.output_filenames.path(OutputType::Object); + let host_dir = host_path.parent().unwrap(); + let out_obj = host_dir.join("host.o"); + let host_out_c = path_to_c_string(device_pathbuf.as_path()); + + // 2) Finalize host: lib.bc + host.out -> host.o (host TM) + // We create a full clone of our LLVM host module, since we will embed the device IR + // into it, and this might break caching or incremental compilation otherwise. + let llmod2 = llvm::LLVMCloneModule(module.module_llvm.llmod()); + let ok = + unsafe { llvm::LLVMRustOffloadEmbedBufferInModule(llmod2, host_out_c.as_ptr()) }; + assert!(ok, "LLVMRustOffloadEmbedBufferInModule failed"); + write_output_file( + dcx, + module.module_llvm.tm.raw(), + config.no_builtins, + llmod2, + &out_obj, + None, + llvm::FileType::ObjectFile, + &cgcx.prof, + true, + ); + if !out_obj.exists() { + dbg!("{:?} does not exist!", out_obj); + panic!("FinalizeOffload failed!"); + } + // We ignore cgcx.save_temps here and unconditionally always keep our `host.out` artifact. + // Otherwise, recompiling the host code would fail since we deleted that device artifact + // in the previous host compilation, which would be confusing at best. + } + } result.into_result().unwrap_or_else(|()| llvm_err(dcx, LlvmError::RunLlvmPasses)) } diff --git a/compiler/rustc_codegen_llvm/src/base.rs b/compiler/rustc_codegen_llvm/src/base.rs index 16455b4c79cd..388118f9b4f1 100644 --- a/compiler/rustc_codegen_llvm/src/base.rs +++ b/compiler/rustc_codegen_llvm/src/base.rs @@ -93,9 +93,9 @@ pub(crate) fn compile_codegen_unit( // They are necessary for correct offload execution. We do this here to simplify the // `offload` intrinsic, avoiding the need for tracking whether it's the first // intrinsic call or not. - if cx.sess().opts.unstable_opts.offload.contains(&Offload::Enable) - && !cx.sess().target.is_like_gpu - { + let has_host_offload = + cx.sess().opts.unstable_opts.offload.iter().any(|o| matches!(o, Offload::Host(_))); + if has_host_offload && !cx.sess().target.is_like_gpu { cx.offload_globals.replace(Some(OffloadGlobals::declare(&cx))); } diff --git a/compiler/rustc_codegen_llvm/src/intrinsic.rs b/compiler/rustc_codegen_llvm/src/intrinsic.rs index 3bc890310cc8..f3d919207477 100644 --- a/compiler/rustc_codegen_llvm/src/intrinsic.rs +++ b/compiler/rustc_codegen_llvm/src/intrinsic.rs @@ -202,13 +202,7 @@ impl<'ll, 'tcx> IntrinsicCallBuilderMethods<'tcx> for Builder<'_, 'll, 'tcx> { return Ok(()); } sym::offload => { - if !tcx - .sess - .opts - .unstable_opts - .offload - .contains(&rustc_session::config::Offload::Enable) - { + if tcx.sess.opts.unstable_opts.offload.is_empty() { let _ = tcx.dcx().emit_almost_fatal(OffloadWithoutEnable); } diff --git a/compiler/rustc_codegen_llvm/src/llvm/ffi.rs b/compiler/rustc_codegen_llvm/src/llvm/ffi.rs index a86b4cc38915..75b3e5955b78 100644 --- a/compiler/rustc_codegen_llvm/src/llvm/ffi.rs +++ b/compiler/rustc_codegen_llvm/src/llvm/ffi.rs @@ -1666,7 +1666,15 @@ mod Offload { use super::*; unsafe extern "C" { /// Processes the module and writes it in an offload compatible way into a "host.out" file. - pub(crate) fn LLVMRustBundleImages<'a>(M: &'a Module, TM: &'a TargetMachine) -> bool; + pub(crate) fn LLVMRustBundleImages<'a>( + M: &'a Module, + TM: &'a TargetMachine, + host_out: *const c_char, + ) -> bool; + pub(crate) unsafe fn LLVMRustOffloadEmbedBufferInModule<'a>( + _M: &'a Module, + _host_out: *const c_char, + ) -> bool; pub(crate) fn LLVMRustOffloadMapper<'a>(OldFn: &'a Value, NewFn: &'a Value); } } @@ -1680,7 +1688,17 @@ mod Offload_fallback { /// Processes the module and writes it in an offload compatible way into a "host.out" file. /// Marked as unsafe to match the real offload wrapper which is unsafe due to FFI. #[allow(unused_unsafe)] - pub(crate) unsafe fn LLVMRustBundleImages<'a>(_M: &'a Module, _TM: &'a TargetMachine) -> bool { + pub(crate) unsafe fn LLVMRustBundleImages<'a>( + _M: &'a Module, + _TM: &'a TargetMachine, + _host_out: *const c_char, + ) -> bool { + unimplemented!("This rustc version was not built with LLVM Offload support!"); + } + pub(crate) unsafe fn LLVMRustOffloadEmbedBufferInModule<'a>( + _M: &'a Module, + _host_out: *const c_char, + ) -> bool { unimplemented!("This rustc version was not built with LLVM Offload support!"); } #[allow(unused_unsafe)] diff --git a/compiler/rustc_interface/src/tests.rs b/compiler/rustc_interface/src/tests.rs index 8dab3a7f37f5..d075f94ef850 100644 --- a/compiler/rustc_interface/src/tests.rs +++ b/compiler/rustc_interface/src/tests.rs @@ -837,7 +837,7 @@ fn test_unstable_options_tracking_hash() { tracked!(no_profiler_runtime, true); tracked!(no_trait_vptr, true); tracked!(no_unique_section_names, true); - tracked!(offload, vec![Offload::Enable]); + tracked!(offload, vec![Offload::Device]); tracked!(on_broken_pipe, OnBrokenPipe::Kill); tracked!(osx_rpath_install_name, true); tracked!(packed_bundled_libs, true); diff --git a/compiler/rustc_llvm/llvm-wrapper/RustWrapper.cpp b/compiler/rustc_llvm/llvm-wrapper/RustWrapper.cpp index 0720af0eb7e0..02e6abf24627 100644 --- a/compiler/rustc_llvm/llvm-wrapper/RustWrapper.cpp +++ b/compiler/rustc_llvm/llvm-wrapper/RustWrapper.cpp @@ -43,8 +43,10 @@ // available. As such, we only try to build it in the first place, if // llvm.offload is enabled. #ifdef OFFLOAD +#include "llvm/Bitcode/BitcodeReader.h" #include "llvm/Object/OffloadBinary.h" #include "llvm/Target/TargetMachine.h" +#include "llvm/Transforms/Utils/ModuleUtils.h" #endif // for raw `write` in the bad-alloc handler @@ -174,12 +176,13 @@ static Error writeFile(StringRef Filename, StringRef Data) { // --image=file=device.bc,triple=amdgcn-amd-amdhsa,arch=gfx90a,kind=openmp // The input module is the rust code compiled for a gpu target like amdgpu. // Based on clang/tools/clang-offload-packager/ClangOffloadPackager.cpp -extern "C" bool LLVMRustBundleImages(LLVMModuleRef M, TargetMachine &TM) { +extern "C" bool LLVMRustBundleImages(LLVMModuleRef M, TargetMachine &TM, + const char *HostOutPath) { std::string Storage; llvm::raw_string_ostream OS1(Storage); llvm::WriteBitcodeToFile(*unwrap(M), OS1); OS1.flush(); - auto MB = llvm::MemoryBuffer::getMemBufferCopy(Storage, "module.bc"); + auto MB = llvm::MemoryBuffer::getMemBufferCopy(Storage, "device.bc"); SmallVector BinaryData; raw_svector_ostream OS2(BinaryData); @@ -188,19 +191,38 @@ extern "C" bool LLVMRustBundleImages(LLVMModuleRef M, TargetMachine &TM) { ImageBinary.TheImageKind = object::IMG_Bitcode; ImageBinary.Image = std::move(MB); ImageBinary.TheOffloadKind = object::OFK_OpenMP; - ImageBinary.StringData["triple"] = TM.getTargetTriple().str(); - ImageBinary.StringData["arch"] = TM.getTargetCPU(); + + std::string TripleStr = TM.getTargetTriple().str(); + llvm::StringRef CPURef = TM.getTargetCPU(); + ImageBinary.StringData["triple"] = TripleStr; + ImageBinary.StringData["arch"] = CPURef; llvm::SmallString<0> Buffer = OffloadBinary::write(ImageBinary); if (Buffer.size() % OffloadBinary::getAlignment() != 0) // Offload binary has invalid size alignment return false; OS2 << Buffer; - if (Error E = writeFile("host.out", + if (Error E = writeFile(HostOutPath, StringRef(BinaryData.begin(), BinaryData.size()))) return false; return true; } +extern "C" bool LLVMRustOffloadEmbedBufferInModule(LLVMModuleRef HostM, + const char *HostOutPath) { + auto MBOrErr = MemoryBuffer::getFile(HostOutPath); + if (!MBOrErr) { + auto E = MBOrErr.getError(); + auto _B = errorCodeToError(E); + return false; + } + MemoryBufferRef Buf = (*MBOrErr)->getMemBufferRef(); + Module *M = unwrap(HostM); + StringRef SectionName = ".llvm.offloading"; + Align Alignment = Align(8); + llvm::embedBufferInModule(*M, Buf, SectionName, Alignment); + return true; +} + extern "C" void LLVMRustOffloadMapper(LLVMValueRef OldFn, LLVMValueRef NewFn) { llvm::Function *oldFn = llvm::unwrap(OldFn); llvm::Function *newFn = llvm::unwrap(NewFn); diff --git a/compiler/rustc_session/src/config.rs b/compiler/rustc_session/src/config.rs index a3a97dfec61d..2774333573f6 100644 --- a/compiler/rustc_session/src/config.rs +++ b/compiler/rustc_session/src/config.rs @@ -190,10 +190,12 @@ pub enum CoverageLevel { } // The different settings that the `-Z offload` flag can have. -#[derive(Clone, Copy, PartialEq, Hash, Debug)] +#[derive(Clone, PartialEq, Hash, Debug)] pub enum Offload { - /// Enable the llvm offload pipeline - Enable, + /// Entry point for `std::offload`, enables kernel compilation for a gpu device + Device, + /// Second step in the offload pipeline, generates the host code to call kernels. + Host(String), } /// The different settings that the `-Z autodiff` flag can have. @@ -2578,9 +2580,7 @@ pub fn build_session_options(early_dcx: &mut EarlyDiagCtxt, matches: &getopts::M ) } - if !nightly_options::is_unstable_enabled(matches) - && unstable_opts.offload.contains(&Offload::Enable) - { + if !nightly_options::is_unstable_enabled(matches) && !unstable_opts.offload.is_empty() { early_dcx.early_fatal( "`-Zoffload=Enable` also requires `-Zunstable-options` \ and a nightly compiler", diff --git a/compiler/rustc_session/src/options.rs b/compiler/rustc_session/src/options.rs index aea0b73ee927..2b83d1225c97 100644 --- a/compiler/rustc_session/src/options.rs +++ b/compiler/rustc_session/src/options.rs @@ -1451,8 +1451,27 @@ pub mod parse { let mut v: Vec<&str> = v.split(",").collect(); v.sort_unstable(); for &val in v.iter() { - let variant = match val { - "Enable" => Offload::Enable, + // Split each entry on '=' if it has an argument + let (key, arg) = match val.split_once('=') { + Some((k, a)) => (k, Some(a)), + None => (val, None), + }; + + let variant = match key { + "Host" => { + if let Some(p) = arg { + Offload::Host(p.to_string()) + } else { + return false; + } + } + "Device" => { + if let Some(_) = arg { + // Device does not accept a value + return false; + } + Offload::Device + } _ => { // FIXME(ZuseZ4): print an error saying which value is not recognized return false;