Remove the need to call clang for std::offload usages

This commit is contained in:
Manuel Drehwald 2025-12-09 07:50:30 -08:00
parent 0ac9e59d8f
commit dfef2e96fe
8 changed files with 149 additions and 30 deletions

View file

@ -703,10 +703,9 @@ pub(crate) unsafe fn llvm_optimize(
llvm::set_value_name(new_fn, &name);
}
if cgcx.target_is_like_gpu && config.offload.contains(&config::Offload::Enable) {
if cgcx.target_is_like_gpu && config.offload.contains(&config::Offload::Device) {
let cx =
SimpleCx::new(module.module_llvm.llmod(), module.module_llvm.llcx, cgcx.pointer_size);
for func in cx.get_functions() {
let offload_kernel = "offload-kernel";
if attributes::has_string_attr(func, offload_kernel) {
@ -775,12 +774,79 @@ pub(crate) unsafe fn llvm_optimize(
)
};
if cgcx.target_is_like_gpu && config.offload.contains(&config::Offload::Enable) {
if cgcx.target_is_like_gpu && config.offload.contains(&config::Offload::Device) {
let device_path = cgcx.output_filenames.path(OutputType::Object);
let device_dir = device_path.parent().unwrap();
let device_out = device_dir.join("host.out");
let device_out_c = path_to_c_string(device_out.as_path());
unsafe {
llvm::LLVMRustBundleImages(module.module_llvm.llmod(), module.module_llvm.tm.raw());
// 1) Bundle device module into offload image host.out (device TM)
let ok = llvm::LLVMRustBundleImages(
module.module_llvm.llmod(),
module.module_llvm.tm.raw(),
device_out_c.as_ptr(),
);
assert!(ok, "LLVMRustBundleImages (device -> host.out) failed");
if !device_out.exists() {
panic!("BundleImages failed, `host.out` was not created!");
}
}
}
// This assumes that we previously compiled our kernels for a gpu target, which created a
// `host.out` artifact. The user is supposed to provide us with a path to this artifact; we
// don't need any other artifacts from the previous run. We will embed this artifact into our
// LLVM-IR host module to create a `host.o` object file, which we will write to disk.
// The last, not yet automated step uses the `clang-linker-wrapper` to process `host.o`.
// Host-side finalization: only runs for non-gpu targets that were given `-Zoffload=Host=<path>`.
if !cgcx.target_is_like_gpu {
    if let Some(device_path) = config
        .offload
        .iter()
        .find_map(|o| if let config::Offload::Host(path) = o { Some(path) } else { None })
    {
        let device_pathbuf = PathBuf::from(device_path);
        if device_pathbuf.is_relative() {
            panic!("Absolute path is needed");
        }
        // Compare against `Some("host.out")` so that paths without a file name, or with a
        // non-UTF-8 file name, are rejected as well instead of silently passing this check.
        if device_pathbuf.file_name().and_then(|n| n.to_str()) != Some("host.out") {
            panic!("Need path to the host.out file");
        }
        assert!(
            device_pathbuf.exists(),
            "offload device artifact `{}` does not exist",
            device_pathbuf.display()
        );

        // `host.o` is placed next to the regular object file output.
        let host_path = cgcx.output_filenames.path(OutputType::Object);
        let host_dir =
            host_path.parent().expect("object output path should have a parent directory");
        let out_obj = host_dir.join("host.o");
        let host_out_c = path_to_c_string(device_pathbuf.as_path());

        // 2) Finalize host: lib.bc + host.out -> host.o (host TM)
        // We create a full clone of our LLVM host module, since we will embed the device IR
        // into it, and this might break caching or incremental compilation otherwise.
        // NOTE(review): the cloned module is not disposed in this block — confirm whether
        // `write_output_file` takes ownership or whether this leaks.
        let llmod2 = llvm::LLVMCloneModule(module.module_llvm.llmod());
        let ok =
            unsafe { llvm::LLVMRustOffloadEmbedBufferInModule(llmod2, host_out_c.as_ptr()) };
        assert!(ok, "LLVMRustOffloadEmbedBufferInModule failed");
        write_output_file(
            dcx,
            module.module_llvm.tm.raw(),
            config.no_builtins,
            llmod2,
            &out_obj,
            None,
            llvm::FileType::ObjectFile,
            &cgcx.prof,
            true,
        );
        if !out_obj.exists() {
            // `dbg!` is not a formatting macro; report the missing path in the panic itself.
            panic!("FinalizeOffload failed: `{}` does not exist!", out_obj.display());
        }
        // We ignore cgcx.save_temps here and always keep our `host.out` artifact.
        // Otherwise, recompiling the host code would fail since we deleted that device artifact
        // in the previous host compilation, which would be confusing at best.
    }
}
result.into_result().unwrap_or_else(|()| llvm_err(dcx, LlvmError::RunLlvmPasses))
}

View file

@ -93,9 +93,9 @@ pub(crate) fn compile_codegen_unit(
// They are necessary for correct offload execution. We do this here to simplify the
// `offload` intrinsic, avoiding the need for tracking whether it's the first
// intrinsic call or not.
if cx.sess().opts.unstable_opts.offload.contains(&Offload::Enable)
&& !cx.sess().target.is_like_gpu
{
let has_host_offload =
cx.sess().opts.unstable_opts.offload.iter().any(|o| matches!(o, Offload::Host(_)));
if has_host_offload && !cx.sess().target.is_like_gpu {
cx.offload_globals.replace(Some(OffloadGlobals::declare(&cx)));
}

View file

@ -202,13 +202,7 @@ impl<'ll, 'tcx> IntrinsicCallBuilderMethods<'tcx> for Builder<'_, 'll, 'tcx> {
return Ok(());
}
sym::offload => {
if !tcx
.sess
.opts
.unstable_opts
.offload
.contains(&rustc_session::config::Offload::Enable)
{
if tcx.sess.opts.unstable_opts.offload.is_empty() {
let _ = tcx.dcx().emit_almost_fatal(OffloadWithoutEnable);
}

View file

@ -1666,7 +1666,15 @@ mod Offload {
use super::*;
unsafe extern "C" {
/// Processes the module and writes it in an offload compatible way into the file at
/// `host_out` (expected to be named "host.out").
pub(crate) fn LLVMRustBundleImages<'a>(M: &'a Module, TM: &'a TargetMachine) -> bool;
pub(crate) fn LLVMRustBundleImages<'a>(
M: &'a Module,
TM: &'a TargetMachine,
host_out: *const c_char,
) -> bool;
/// Embeds the offload image stored in the file at `host_out` into the module `M`.
/// Returns `false` on failure (e.g. if the file could not be read).
// Items in an `unsafe extern` block are unsafe to call unless marked `safe`, so no explicit
// `unsafe` qualifier is needed; this matches the sibling declarations.
pub(crate) fn LLVMRustOffloadEmbedBufferInModule<'a>(
    M: &'a Module,
    host_out: *const c_char,
) -> bool;
pub(crate) fn LLVMRustOffloadMapper<'a>(OldFn: &'a Value, NewFn: &'a Value);
}
}
@ -1680,7 +1688,17 @@ mod Offload_fallback {
/// Processes the module and writes it in an offload compatible way into a "host.out" file.
/// Marked as unsafe to match the real offload wrapper which is unsafe due to FFI.
#[allow(unused_unsafe)]
pub(crate) unsafe fn LLVMRustBundleImages<'a>(_M: &'a Module, _TM: &'a TargetMachine) -> bool {
pub(crate) unsafe fn LLVMRustBundleImages<'a>(
_M: &'a Module,
_TM: &'a TargetMachine,
_host_out: *const c_char,
) -> bool {
unimplemented!("This rustc version was not built with LLVM Offload support!");
}
/// Fallback for embedding the offload image at `_host_out` into a module; always panics,
/// since this build has no LLVM Offload support.
/// Marked as unsafe to match the real offload wrapper which is unsafe due to FFI.
#[allow(unused_unsafe)]
pub(crate) unsafe fn LLVMRustOffloadEmbedBufferInModule<'a>(
    _M: &'a Module,
    _host_out: *const c_char,
) -> bool {
    unimplemented!("This rustc version was not built with LLVM Offload support!");
}
#[allow(unused_unsafe)]

View file

@ -837,7 +837,7 @@ fn test_unstable_options_tracking_hash() {
tracked!(no_profiler_runtime, true);
tracked!(no_trait_vptr, true);
tracked!(no_unique_section_names, true);
tracked!(offload, vec![Offload::Enable]);
tracked!(offload, vec![Offload::Device]);
tracked!(on_broken_pipe, OnBrokenPipe::Kill);
tracked!(osx_rpath_install_name, true);
tracked!(packed_bundled_libs, true);

View file

@ -43,8 +43,10 @@
// available. As such, we only try to build it in the first place, if
// llvm.offload is enabled.
#ifdef OFFLOAD
#include "llvm/Bitcode/BitcodeReader.h"
#include "llvm/Object/OffloadBinary.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Transforms/Utils/ModuleUtils.h"
#endif
// for raw `write` in the bad-alloc handler
@ -174,12 +176,13 @@ static Error writeFile(StringRef Filename, StringRef Data) {
// --image=file=device.bc,triple=amdgcn-amd-amdhsa,arch=gfx90a,kind=openmp
// The input module is the rust code compiled for a gpu target like amdgpu.
// Based on clang/tools/clang-offload-packager/ClangOffloadPackager.cpp
extern "C" bool LLVMRustBundleImages(LLVMModuleRef M, TargetMachine &TM) {
extern "C" bool LLVMRustBundleImages(LLVMModuleRef M, TargetMachine &TM,
const char *HostOutPath) {
std::string Storage;
llvm::raw_string_ostream OS1(Storage);
llvm::WriteBitcodeToFile(*unwrap(M), OS1);
OS1.flush();
auto MB = llvm::MemoryBuffer::getMemBufferCopy(Storage, "module.bc");
auto MB = llvm::MemoryBuffer::getMemBufferCopy(Storage, "device.bc");
SmallVector<char, 1024> BinaryData;
raw_svector_ostream OS2(BinaryData);
@ -188,19 +191,38 @@ extern "C" bool LLVMRustBundleImages(LLVMModuleRef M, TargetMachine &TM) {
ImageBinary.TheImageKind = object::IMG_Bitcode;
ImageBinary.Image = std::move(MB);
ImageBinary.TheOffloadKind = object::OFK_OpenMP;
ImageBinary.StringData["triple"] = TM.getTargetTriple().str();
ImageBinary.StringData["arch"] = TM.getTargetCPU();
std::string TripleStr = TM.getTargetTriple().str();
llvm::StringRef CPURef = TM.getTargetCPU();
ImageBinary.StringData["triple"] = TripleStr;
ImageBinary.StringData["arch"] = CPURef;
llvm::SmallString<0> Buffer = OffloadBinary::write(ImageBinary);
if (Buffer.size() % OffloadBinary::getAlignment() != 0)
// Offload binary has invalid size alignment
return false;
OS2 << Buffer;
if (Error E = writeFile("host.out",
if (Error E = writeFile(HostOutPath,
StringRef(BinaryData.begin(), BinaryData.size())))
return false;
return true;
}
// Reads the offload image stored at `HostOutPath` and embeds its contents into
// the host module `HostM` as a `.llvm.offloading` section.
// Returns false if an argument is null or the file cannot be read, and true
// once the buffer has been embedded.
extern "C" bool LLVMRustOffloadEmbedBufferInModule(LLVMModuleRef HostM,
                                                   const char *HostOutPath) {
  if (!HostM || !HostOutPath)
    return false;

  // `getFile` returns an `ErrorOr`, which carries a plain `std::error_code`.
  // Do not convert it into an `llvm::Error` here: an `llvm::Error` that is
  // destroyed without being handled aborts in builds with ABI-breaking checks.
  auto MBOrErr = MemoryBuffer::getFile(HostOutPath);
  if (!MBOrErr)
    return false;

  MemoryBufferRef Buf = (*MBOrErr)->getMemBufferRef();
  Module *M = unwrap(HostM);
  StringRef SectionName = ".llvm.offloading";
  // NOTE(review): 8 presumably mirrors OffloadBinary::getAlignment() — confirm.
  Align Alignment = Align(8);
  llvm::embedBufferInModule(*M, Buf, SectionName, Alignment);
  return true;
}
extern "C" void LLVMRustOffloadMapper(LLVMValueRef OldFn, LLVMValueRef NewFn) {
llvm::Function *oldFn = llvm::unwrap<llvm::Function>(OldFn);
llvm::Function *newFn = llvm::unwrap<llvm::Function>(NewFn);

View file

@ -190,10 +190,12 @@ pub enum CoverageLevel {
}
/// The different settings that the `-Z offload` flag can have.
#[derive(Clone, Copy, PartialEq, Hash, Debug)]
#[derive(Clone, PartialEq, Hash, Debug)]
pub enum Offload {
/// Enable the llvm offload pipeline
Enable,
/// Entry point for `std::offload`, enables kernel compilation for a gpu device.
/// Spelled `Device` on the command line; it does not accept a value.
Device,
/// Second step in the offload pipeline, generates the host code to call kernels.
/// Spelled `Host=<path>` on the command line; the payload is the path to the `host.out`
/// artifact created by the device step.
Host(String),
}
/// The different settings that the `-Z autodiff` flag can have.
@ -2578,9 +2580,7 @@ pub fn build_session_options(early_dcx: &mut EarlyDiagCtxt, matches: &getopts::M
)
}
if !nightly_options::is_unstable_enabled(matches)
&& unstable_opts.offload.contains(&Offload::Enable)
{
if !nightly_options::is_unstable_enabled(matches) && !unstable_opts.offload.is_empty() {
early_dcx.early_fatal(
"`-Zoffload=Enable` also requires `-Zunstable-options` \
and a nightly compiler",

View file

@ -1451,8 +1451,27 @@ pub mod parse {
let mut v: Vec<&str> = v.split(",").collect();
v.sort_unstable();
for &val in v.iter() {
let variant = match val {
"Enable" => Offload::Enable,
// Split each entry on '=' if it has an argument
let (key, arg) = match val.split_once('=') {
Some((k, a)) => (k, Some(a)),
None => (val, None),
};
let variant = match key {
"Host" => {
if let Some(p) = arg {
Offload::Host(p.to_string())
} else {
return false;
}
}
"Device" => {
if let Some(_) = arg {
// Device does not accept a value
return false;
}
Offload::Device
}
_ => {
// FIXME(ZuseZ4): print an error saying which value is not recognized
return false;